mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
More manifests, base modules and rename from archiver to extractor.
This commit is contained in:
13
src/auto_archiver/modules/html_formatter/__manifest__.py
Normal file
13
src/auto_archiver/modules/html_formatter/__manifest__.py
Normal file
@@ -0,0 +1,13 @@
|
||||
# Manifest for the HTML formatter module: static metadata describing the
# module, what it depends on and which configuration options it exposes.
m = {
    "name": "HTML Formatter",
    "type": ["formatter"],
    "requires_setup": False,
    "external_dependencies": {
        "python": ["loguru", "jinja2"],
        # NOTE(review): a single empty string, not an empty list — confirm
        # the manifest loader treats this as "no binaries required".
        "bin": [""],
    },
    "configs": {
        "detect_thumbnails": {
            "default": True,
            "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
        },
    },
    "description": """ """,
}
|
||||
99
src/auto_archiver/modules/html_formatter/html_formatter.py
Normal file
99
src/auto_archiver/modules/html_formatter/html_formatter.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
import mimetypes, os, pathlib
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from urllib.parse import quote
|
||||
from loguru import logger
|
||||
import json
|
||||
import base64
|
||||
|
||||
from auto_archiver.version import __version__
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.base_modules import Formatter
|
||||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
|
||||
@dataclass
class HtmlFormatter(Formatter):
    """Formatter that renders an archived item as a standalone HTML report.

    The report is produced from the ``html_template.html`` Jinja template
    found in the ``templates/`` directory next to this file.
    """
    name = "html_formatter"

    def __init__(self, config: dict) -> None:
        """Load the Jinja environment, register the filters and the template.

        Args:
            config: configuration dictionary forwarded to the parent class.
        """
        # without this STEP.__init__ is not called
        super().__init__(config)
        templates_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
        self.environment = Environment(loader=FileSystemLoader(templates_dir), autoescape=True)
        # JinjaHelper class static methods are added as filters
        self.environment.filters.update({
            k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
        })
        self.template = self.environment.get_template("html_template.html")

    # The `detect_thumbnails` option is declared in this module's
    # __manifest__.py ("configs" entry), not in a configs() method here.

    def format(self, item: Metadata) -> Media | None:
        """Render ``item`` to an HTML file and return it as a ``Media``.

        Args:
            item: the metadata object whose url, title, media and metadata
                are passed to the template.

        Returns:
            A ``Media`` pointing at the generated HTML file in the archiving
            context's temporary directory, or ``None`` when ``item`` is empty.
        """
        url = item.get_url()
        if item.is_empty():
            logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}")
            return None

        content = self.template.render(
            url=url,
            title=item.get_title(),
            media=item.media,
            metadata=item.metadata,
            version=__version__
        )

        # random suffix keeps reports of different items from colliding
        html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
        with open(html_path, mode="w", encoding="utf-8") as outf:
            outf.write(content)
        final_media = Media(filename=html_path, _mimetype="text/html")

        # The chunk size is passed as an int: it is a byte count, and Python
        # file reads reject float sizes.
        he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": int(1.6e7)}})
        # only attach a hash when one was actually computed
        if len(hd := he.calculate_hash(final_media.filename)):
            final_media.set("hash", f"{he.algorithm}:{hd}")

        return final_media
|
||||
|
||||
|
||||
# JINJA helper filters
|
||||
class JinjaHelpers:
    """Small predicates and converters exposed to the templates as filters.

    ``HtmlFormatter.__init__`` registers every ``staticmethod`` of this class
    as a Jinja filter under the method's own name, so anything added here as
    a static method becomes available in the templates.
    """

    @staticmethod
    def is_list(v) -> bool:
        """Return True when ``v`` is a ``list`` (or a subclass of it)."""
        return isinstance(v, list)

    @staticmethod
    def is_video(s: str) -> bool:
        """Return True when the MIME type guessed from ``s`` mentions "video"."""
        guessed, _encoding = mimetypes.guess_type(s)
        if not guessed:
            return False
        return "video" in guessed

    @staticmethod
    def is_image(s: str) -> bool:
        """Return True when the MIME type guessed from ``s`` mentions "image"."""
        guessed, _encoding = mimetypes.guess_type(s)
        if not guessed:
            return False
        return "image" in guessed

    @staticmethod
    def is_audio(s: str) -> bool:
        """Return True when the MIME type guessed from ``s`` mentions "audio"."""
        guessed, _encoding = mimetypes.guess_type(s)
        if not guessed:
            return False
        return "audio" in guessed

    @staticmethod
    def is_media(v) -> bool:
        """Return True when ``v`` is a ``Media`` instance."""
        return isinstance(v, Media)

    @staticmethod
    def get_extension(filename: str) -> str:
        """Return the extension of ``filename`` including the dot, or ""."""
        _stem, extension = os.path.splitext(filename)
        return extension

    @staticmethod
    def quote(s: str) -> str:
        """Percent-encode ``s`` for use inside a URL."""
        # inside the method body `quote` resolves to the module-level
        # urllib.parse.quote, not to this static method
        return quote(s)

    @staticmethod
    def json_dump_b64(d: dict) -> str:
        """Serialize ``d`` to indented JSON and return it base64-encoded.

        Values that JSON cannot encode natively are converted with ``str``.
        """
        as_json = json.dumps(d, indent=4, default=str)
        encoded = base64.b64encode(as_json.encode())
        return encoded.decode()
|
||||
@@ -0,0 +1,332 @@
|
||||
{# templates/html_template.html #}
|
||||
{% import 'macros.html' as macros %}
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
    <meta charset="utf-8">
    <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300italic,700,700italic">
    <title>{{ url }}</title>
    <style>
        /* ---- base typography ---- */
        html {
            font-family: 'Roboto', sans-serif;
        }

        /* ---- tables: fixed layout so long values wrap instead of stretching ---- */
        table {
            table-layout: fixed;
            width: 90%;
        }

        table td {
            word-wrap: break-word;
            overflow-wrap: break-word;
            padding: 5px;
        }

        table,
        th,
        td {
            margin: auto;
            border: 1px solid;
            border-collapse: collapse;
            vertical-align: top;
        }

        table.metadata td:first-child {
            text-align: center;
        }

        table.content td:nth-child(2),
        .center {
            text-align: center;
        }

        /* ---- click-to-copy elements ---- */
        .copy:hover {
            background: aliceblue;
            cursor: copy;
        }

        /* ---- notification box, hidden until the script shows it ---- */
        #notification {
            position: fixed;
            right: 20px;
            top: 20px;
            background: aquamarine;
            box-shadow: 6px 8px 5px 0px #000000;
            padding: 10px;
            font-size: large;
            display: none;
        }

        /* ---- media previews start in grayscale ---- */
        img,
        video {
            filter: gray;
            -webkit-filter: grayscale(1);
            filter: grayscale(1);
        }

        /* Disable grayscale on hover */
        /* img:hover,
        video:hover {
            -webkit-filter: grayscale(0);
            filter: none;
        } */

        /* ---- collapsible sections ---- */
        .collapsible {
            background-color: #777;
            color: white;
            cursor: pointer;
            padding: 5px;
            margin: 10px;
            width: 100%;
            border: none;
            text-align: left;
            outline: none;
            font-size: 15px;
        }

        .active,
        .collapsible:hover {
            background-color: #555;
        }

        .collapsible-content {
            padding: 0 18px;
            display: none;
            overflow: hidden;
            background-color: #f1f1f1;
        }

        /* ---- certificate and text previews ---- */
        .pem-certificate, .text-preview {
            text-align: left;
            font-size: small;
        }

        .text-preview {
            padding-left: 10px;
            padding-right: 10px;
            white-space: pre-wrap;
        }
    </style>
</head>
|
||||
|
||||
<body>
    {# filled and shown by the showNotification() script function #}
    <div id="notification"></div>

    {# ---- header: archived url and optional title ---- #}
    <h2>Archived media for <span class="copy">{{ url }}</span> - <a href="{{ url }}">open</a></h2>
    {% if title | string | length > 0 %}
    <p><b>title:</b> '<span class="copy">{{ title }}</span>'</p>
    {% endif %}

    {# ---- media table: one row per media item ---- #}
    <h2 class="center">content {{ media | length }} item(s)</h2>
    <form class="center">
        <label>
            <input type="checkbox" id="safe-media-view" checked>
            Safe Media View
        </label>
    </form>
    <table class="content">
        <tr>
            <th>about</th>
            <th>files and preview</th>
        </tr>
        <tbody>
            {% for m in media %}
            <tr>
                <td>
                    {{ macros.display_recursive(m, true) }}
                </td>
                <td>
                    {{ macros.display_media(m, true, url) }}
                </td>
            </tr>
            {% endfor %}
        </tbody>
    </table>

    {# ---- metadata table: one row per metadata key ---- #}
    <h2 class="center">metadata</h2>
    <table class="metadata">
        <tr>
            <th>key</th>
            <th>value</th>
        </tr>
        {% for key in metadata %}
        {% set value = metadata[key] %}
        <tr>
            <td>{{ key }}</td>
            <td>
                {# mappings additionally get a base64 payload for "Copy as JSON" #}
                {% if value is mapping %}
                <div class="center copy" copy-value64='{{ value | json_dump_b64 }}'>Copy as JSON</div>
                {% endif %}
                {{ macros.copy_urlize(value) }}
            </td>
        </tr>
        {% endfor %}
    </table>

    <p class="center">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a>
        v{{ version }}</p>
</body>
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/forge/0.10.0/forge.min.js"></script>
|
||||
<script defer>
|
||||
// partial decode of SSL certificates
|
||||
/**
 * Build an HTML summary (subject, issuer, validity, serial number) of a
 * PEM-encoded certificate, parsed with the forge library.
 *
 * The returned string is assigned to innerHTML by the caller, so every value
 * taken from the certificate is HTML-escaped first: certificate fields are
 * external data and must not be able to inject markup.
 *
 * @param {string} sslCert PEM text of the certificate.
 * @returns {string} HTML fragment describing the certificate.
 */
function decodeCertificate(sslCert) {
    const HTML_ESCAPES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
    const escapeHtml = value => String(value).replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);
    // "CN: example.org, O: Example" style rendering of a subject/issuer name
    const describeName = name =>
        escapeHtml(name.attributes.map(attr => `${attr.shortName}: ${attr.value}`).join(", "));

    const cert = forge.pki.certificateFromPem(sslCert);
    return `SSL CERTIFICATE PREVIEW:<br/><ul>
        <li><b>Subject:</b> <span class="copy">${describeName(cert.subject)}</span></li>
        <li><b>Issuer:</b> <span class="copy">${describeName(cert.issuer)}</span></li>
        <li><b>Valid From:</b> <span class="copy">${escapeHtml(cert.validity.notBefore)}</span></li>
        <li><b>Valid To:</b> <span class="copy">${escapeHtml(cert.validity.notAfter)}</span></li>
        <li><b>Serial Number:</b> <span class="copy">${escapeHtml(cert.serialNumber)}</span></li>
        </ul>`;
}
|
||||
|
||||
/**
 * Entry point: start every page setup step.
 *
 * Steps are started one after another without waiting for the previous one
 * to finish, and a failure in one step is logged without stopping the others.
 */
async function run() {
    const steps = [
        previewCertificates,
        previewText,
        enableCopyLogic,
        enableCollapsibleLogic,
        setupSafeView
    ];

    // runs one step and reports its failure instead of propagating it
    const runStep = async (step) => {
        try {
            await step();
        } catch (e) {
            console.error(`Error in ${step.name}: ${e}`);
        }
    };

    for (const step of steps) {
        runStep(step); // deliberately not awaited
    }
}
|
||||
|
||||
/**
 * For every ".pem-certificate" element: fetch the PEM file named by its
 * "pem" attribute, render a decoded summary inside the element and append a
 * link to CyberChef (with the certificate as base64 input) after it.
 */
async function previewCertificates() {
    const certificateNodes = Array.from(document.querySelectorAll(".pem-certificate"));

    const renderCertificate = async (node) => {
        const response = await fetch(node.getAttribute("pem"));
        const pemText = await response.text();
        node.innerHTML = decodeCertificate(pemText);

        const cyberChefUrl =
            `https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(pemText)}`;
        // create a new anchor with this url and append after the code
        const link = document.createElement("a");
        link.href = cyberChefUrl;
        link.textContent = "Full certificate details";
        node.parentElement.appendChild(link);
    };

    await Promise.all(certificateNodes.map(node => renderCertificate(node)));
    console.log("certificate preview done");
}
|
||||
|
||||
/**
 * For every ".text-preview" element: fetch the resource named by its "url"
 * attribute and show the response body as the element's text.
 */
async function previewText() {
    const previewNodes = Array.from(document.querySelectorAll(".text-preview"));

    const fillPreview = async (node) => {
        const response = await fetch(node.getAttribute("url"));
        // textContent (not innerHTML): the fetched text is shown verbatim
        node.textContent = await response.text();
    };

    await Promise.all(previewNodes.map(node => fillPreview(node)));
    console.log("text preview done");
}
|
||||
|
||||
// notification logic
|
||||
const notification = document.getElementById("notification");
|
||||
|
||||
/**
 * Show `message` in the notification box and hide it again after a delay.
 *
 * @param {string} message Text to display.
 * @param {number} [miliseconds] Display time; defaults to 1000 when omitted or 0.
 */
function showNotification(message, miliseconds) {
    // cancel a pending hide from an earlier call, otherwise its timer would
    // hide this newer notification too early
    clearTimeout(showNotification.hideTimer);

    notification.style.display = "block";
    notification.innerText = message;
    showNotification.hideTimer = setTimeout(() => {
        notification.style.display = "none";
        notification.innerText = "";
    }, miliseconds || 1000)
}
|
||||
|
||||
// copy logic
|
||||
/**
 * Make every ".copy" element copy a value to the clipboard when clicked.
 *
 * The copied value is, in order of precedence: the "copy-value" attribute,
 * the base64-decoded "copy-value64" attribute, or the element's text.
 */
async function enableCopyLogic() {
    // atob() yields one character per byte; decode those bytes as UTF-8 so
    // non-ASCII payloads come out as proper text
    const decodeBase64Utf8 = (b64) =>
        new TextDecoder().decode(Uint8Array.from(atob(b64), ch => ch.charCodeAt(0)));

    document.querySelectorAll(".copy").forEach(el => {
        // a click triggers a copy command, handled by the listener below
        el.onclick = () => {
            document.execCommand("copy");
        }
        el.addEventListener("copy", (e) => {
            e.preventDefault();
            if (!e.clipboardData) {
                return;
            }
            let text;
            if (el.hasAttribute("copy-value")) {
                text = el.getAttribute("copy-value");
            } else if (el.hasAttribute("copy-value64")) {
                text = decodeBase64Utf8(el.getAttribute("copy-value64"));
            } else {
                text = el.textContent;
            }
            e.clipboardData.setData("text/plain", text);
            console.log(e.clipboardData.getData("text"))
            showNotification("copied!")
        })
    });
    console.log("copy logic enabled");
}
|
||||
|
||||
// collapsibles
|
||||
/**
 * Make every ".collapsible" element toggle the ".collapsible-content"
 * element found under the same parent when clicked.
 */
async function enableCollapsibleLogic() {
    const toggles = document.getElementsByClassName("collapsible");
    for (const toggle of toggles) {
        toggle.addEventListener("click", function () {
            this.classList.toggle("active");
            // looked up through the parent rather than nextElementSibling
            const content = this.parentElement.querySelector(".collapsible-content");
            if (!content) {
                return; // nothing to expand for this toggle
            }
            content.style.display = content.style.display === "block" ? "none" : "block";
        });
    }
    console.log("collapsible logic enabled");
}
|
||||
|
||||
/**
 * Wire up the "Safe Media View" checkbox: while it is checked, img/video/embed
 * previews are shown in grayscale, except while the pointer is over them.
 */
async function setupSafeView() {
    const safeImageViewCheckbox = document.getElementById('safe-media-view');
    const visualPreviews = document.querySelectorAll('img, video,embed');

    // sets both the standard and the -webkit- prefixed filter
    const setFilter = (element, value) => {
        element.style.filter = value;
        element.style.webkitFilter = value;
    };

    // apply the checkbox state to every preview
    function toggleGrayscale() {
        visualPreviews.forEach(element => {
            setFilter(element, safeImageViewCheckbox.checked ? 'grayscale(1)' : 'none');
        });
    }

    safeImageViewCheckbox.addEventListener('change', toggleGrayscale);

    // hover handling is done in JavaScript because the inline styles set
    // above take precedence over stylesheet :hover rules
    visualPreviews.forEach(element => {
        // always show colours while hovering
        element.addEventListener('mouseenter', () => setFilter(element, 'none'));

        // back to grayscale on leave, but only in safe view
        element.addEventListener('mouseleave', () => {
            if (safeImageViewCheckbox.checked) {
                setFilter(element, 'grayscale(1)');
            }
        });
    });

    toggleGrayscale();
    console.log("grayscale logic enabled");
}
|
||||
|
||||
run();
|
||||
</script>
|
||||
|
||||
</html>
|
||||
151
src/auto_archiver/modules/html_formatter/templates/macros.html
Normal file
151
src/auto_archiver/modules/html_formatter/templates/macros.html
Normal file
@@ -0,0 +1,151 @@
|
||||
{#
    display_media(m, links, main_url)
    Render a preview for every url of media `m`, chosen by mimetype or file
    extension. When `links` is true, open/download/copy links follow each
    preview. `main_url` is only used for the replayweb link of .wacz files.
#}
{% macro display_media(m, links, main_url) -%}

{% for url in m.urls %}
{% if url | length == 0 %}
No URL available for {{ m.key }}.
{% elif 'http://' in url or 'https://' in url or url.startswith('/') %}
    {% if 'image' in m.mimetype %}
    <div>
        <a href="{{ url }}">
            <img src="{{ url }}" style="max-height:400px;max-width:400px;">
        </a>

        <div>
            Reverse Image Search:
            <a href="https://www.google.com/searchbyimage?sbisrc=4chanx&image_url={{ url | quote }}&safe=off">Google</a>,
            <a href="https://lens.google.com/uploadbyurl?url={{ url | quote }}">Google Lens</a>,
            <a href="https://yandex.ru/images/touch/search?rpt=imageview&url={{ url | quote }}">Yandex</a>,
            <a href="https://www.bing.com/images/search?view=detailv2&iss=sbi&form=SBIVSP&sbisrc=UrlPaste&q=imgurl:{{ url | quote }}">Bing</a>,
            <a href="https://www.tineye.com/search/?url={{ url | quote }}">Tineye</a>
        </div>

        <div>
            Image Forensics:
            <a href="https://fotoforensics.com/?url={{ url | quote }}">FotoForensics</a>,
            {# quoted like every other link that carries the url as a query parameter #}
            <a href="https://mever.iti.gr/forensics/?image={{ url | quote }}">Media Verification Assistant</a>
        </div>
        <p></p>
    </div>
    {% elif 'video' in m.mimetype %}
    <div>
        <video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
            Your browser does not support the video element.
        </video>
    </div>
    {% elif 'application/pdf' in m.mimetype %}
    <div>
        <embed src="{{ url }}" width="100%" height="400px"/>
    </div>
    {% elif 'audio' in m.mimetype %}
    <div>
        <audio controls>
            <source src="{{ url }}" type="{{ m.mimetype }}">
            Your browser does not support the audio element.
        </audio>
    </div>
    {% elif m.filename | get_extension == ".wacz" %}
    <a href="https://replayweb.page/?source={{ url | quote }}#view=pages&url={{ main_url }}">replayweb</a>

    {% elif m.filename | get_extension == ".pem" %}
    {# filled in by the previewCertificates() script function #}
    <code class="pem-certificate" pem="{{url}}"></code>

    {% elif 'text' in m.mimetype %}
    {# filled in by the previewText() script function #}
    <div>PREVIEW:<br/><code><pre class="text-preview" url="{{url}}"></pre></code></div>

    {% else %}
    No preview available for <code>{{ m.key }}</code>.
    {% endif %}
{% else %}
{# neither http(s) nor absolute path: show the current url itself as text #}
{{ url | urlize }}
{% endif %}
{% if links %}
<a href="{{ url }}">open</a> or
<a href="{{ url }}" download="">download</a> or
{{ copy_urlize(url, "copy") }}

<br>
{% endif %}
{% endfor %}

{%- endmacro -%}
|
||||
|
||||
{#
    copy_urlize(val, href_text)
    Render `val` as click-to-copy text with urls turned into links. Lists and
    mappings are rendered recursively. When `href_text` is given it is shown
    instead of the value, while the value itself is what gets copied.
#}
{% macro copy_urlize(val, href_text) -%}

{% if val is mapping %}
<ul>
    {% for key in val %}
    <li>
        <b>{{ key }}:</b> {{ copy_urlize(val[key]) }}
    </li>
    {% endfor %}
</ul>

{% elif val | is_list %}
{% for item in val %}
{{ copy_urlize(item) }}
{% endfor %}

{% elif href_text | length > 0 %}
<span class="copy" copy-value="{{val}}">{{ href_text | string | urlize }}</span>

{% else %}
<span class="copy">{{ val | string | urlize }}</span>
{% endif %}

{%- endmacro -%}
|
||||
|
||||
|
||||
{#
    display_recursive(prop, skip_display)
    Render `prop` according to its kind: mappings as a key/value list with a
    "Copy as JSON" control, lists item by item, media objects with their key,
    type and properties, anything else through copy_urlize. For media objects
    the preview is omitted when `skip_display` is true.
#}
{% macro display_recursive(prop, skip_display) -%}
{% if prop is mapping %}
<div class="center copy" copy-value64='{{prop | json_dump_b64}}'>Copy as JSON</div>
<ul>
    {% for subprop in prop %}
    <li>
        <b>{{ subprop }}:</b>
        {{ display_recursive(prop[subprop]) }}
    </li>
    {% endfor %}
</ul>

{% elif prop | is_list %}
{% for item in prop %}
<li>
    {{ display_recursive(item) }}
</li>
{% endfor %}

{% elif prop | is_media %}
{% if not skip_display %}
{{ display_media(prop, true) }}
{% endif %}
<ul>
    <li><b>key:</b> <span class="copy">{{ prop.key }}</span></li>
    <li><b>type:</b> <span class="copy">{{ prop.mimetype }}</span></li>
    {% for subprop in prop.properties %}
    {% set value = prop.properties[subprop] %}

    {% if value | is_list %}
    {# list-valued properties go into a collapsible section, closed by default #}
    <p></p>
    <div>
        <b class="collapsible" title="expand">{{ subprop }} ({{ value | length }}):</b>
        <p></p>
        <div class="collapsible-content">
            {% for subsubprop in value %}
            {{ display_recursive(subsubprop) }}
            {% endfor %}
        </div>
    </div>
    <p></p>
    {% elif value | string | length > 1 %}
    {# values whose text is at most one character long are not shown #}
    <li><b>{{ subprop }}:</b> {{ copy_urlize(value) }}</li>
    {% endif %}

    {% endfor %}

</ul>
{% else %}
{{ copy_urlize(prop) }}
{% endif %}
{%- endmacro -%}
|
||||
Reference in New Issue
Block a user