mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
html template working with jinja templates
This commit is contained in:
@@ -6,6 +6,7 @@ from typing import List
|
||||
from archivers import Archiverv2
|
||||
from feeders import Feeder
|
||||
from databases import Database
|
||||
from formatters import Formatter
|
||||
from storages import StorageV2
|
||||
from steps.step import Step
|
||||
from enrichers import Enricher
|
||||
@@ -21,13 +22,14 @@ class ConfigV2:
|
||||
Enricher,
|
||||
Archiverv2,
|
||||
Database,
|
||||
StorageV2
|
||||
StorageV2,
|
||||
Formatter
|
||||
# Util
|
||||
]
|
||||
feeder: Step # TODO:= BaseFeeder
|
||||
formatter: Formatter
|
||||
archivers: List[Archiverv2] = field(default_factory=[]) # TODO: fix type
|
||||
enrichers: List[Enricher] = field(default_factory=[])
|
||||
formatters: List[Step] = field(default_factory=[]) # TODO: fix type
|
||||
storages: List[Step] = field(default_factory=[]) # TODO: fix type
|
||||
databases: List[Database] = field(default_factory=[])
|
||||
|
||||
@@ -50,6 +52,7 @@ class ConfigV2:
|
||||
for configurable in self.configurable_parents:
|
||||
child: Step
|
||||
for child in configurable.__subclasses__():
|
||||
assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict."
|
||||
for config, details in child.configs().items():
|
||||
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
|
||||
assert "." not in config, f"config property cannot contain dots('.'): {config}"
|
||||
@@ -87,6 +90,7 @@ class ConfigV2:
|
||||
# print("config.py", self.config)
|
||||
|
||||
self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
|
||||
self.formatter = Formatter.init(steps.get("formatter", "html_formatter"), self.config)
|
||||
self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
|
||||
self.archivers = [Archiverv2.init(e, self.config) for e in steps.get("archivers", [])]
|
||||
self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
|
||||
@@ -97,6 +101,7 @@ class ConfigV2:
|
||||
print("archivers", [e for e in self.archivers])
|
||||
print("databases", [e for e in self.databases])
|
||||
print("storages", [e for e in self.storages])
|
||||
print("formatter", self.formatter)
|
||||
|
||||
def validate(self):
|
||||
pass
|
||||
|
||||
2
src/formatters/__init__.py
Normal file
2
src/formatters/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from .formatter import Formatter
|
||||
from .html_formatter import HtmlFormatter
|
||||
21
src/formatters/formatter.py
Normal file
21
src/formatters/formatter.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod
|
||||
from metadata import Metadata
|
||||
from steps.step import Step
|
||||
|
||||
|
||||
@dataclass
class Formatter(Step):
    """Base class for formatter steps.

    A formatter receives the item being archived and produces a final,
    formatted representation of it through `format`.
    """
    name = "formatter"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def init(name: str, config: dict) -> Formatter:
        """Delegate to `Step.init` for the step called `name`.

        Declared as a static method because it takes no instance: without the
        decorator it could only be called on the class, never on an instance.
        This wrapper exists only to narrow the return type for code typing.
        """
        return Step.init(name, config, Formatter)

    @abstractmethod
    def format(self, item: Metadata) -> Metadata:
        """Format `item`; concrete formatters must override this.

        NOTE(review): HtmlFormatter.format returns a Media rather than a
        Metadata, so the return annotation here likely needs updating - confirm.
        """
        return None
|
||||
36
src/formatters/html_formatter.py
Normal file
36
src/formatters/html_formatter.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod
|
||||
from metadata import Metadata
|
||||
from media import Media
|
||||
from formatters import Formatter
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
import uuid, os, pathlib
|
||||
|
||||
|
||||
@dataclass
class HtmlFormatter(Formatter):
    """Formatter that renders an archived item as a standalone HTML page.

    The page is produced from the Jinja template `html_template.html` located
    in the `templates/` directory next to this module.
    """
    name = "html_formatter"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        templates_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
        # autoescape: url, title and metadata values are interpolated straight
        # into HTML (presumably taken from archived pages, i.e. untrusted), so
        # they must be escaped instead of being rendered as markup.
        self.environment = Environment(loader=FileSystemLoader(templates_dir), autoescape=True)
        self.template = self.environment.get_template("html_template.html")

    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this step (none)."""
        return {}

    def format(self, item: Metadata) -> Media:
        """Render `item` to an HTML file and return it wrapped in a Media.

        The file is written, UTF-8 encoded, into the directory stored under the
        item's "tmp_dir" key, using a random uuid4-based filename.
        """
        print("FORMATTING")
        content = self.template.render(
            url=item.get_url(),
            title=item.get_title(),
            media=item.media,
            metadata=item.get_clean_metadata(),
        )
        html_path = os.path.join(item.get("tmp_dir"), f"formatted{uuid.uuid4()}.html")
        with open(html_path, mode="w", encoding="utf-8") as outf:
            outf.write(content)
        return Media(filename=html_path)
|
||||
101
src/formatters/templates/html_template.html
Normal file
101
src/formatters/templates/html_template.html
Normal file
@@ -0,0 +1,101 @@
|
||||
{# templates/html_template.html #}
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300italic,700,700italic">
|
||||
<title>{{ url }}</title>
|
||||
<style>
|
||||
html {
|
||||
font-family: 'Roboto', sans-serif;
|
||||
}
|
||||
|
||||
table {
|
||||
table-layout: fixed;
|
||||
width: 90%;
|
||||
}
|
||||
|
||||
table td {
|
||||
word-wrap: break-word;
|
||||
overflow-wrap: break-word;
|
||||
padding: 5px;
|
||||
}
|
||||
|
||||
table,
|
||||
th,
|
||||
td {
|
||||
border: 1px solid;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
|
||||
table.metadata td:first-child {
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
table.content td:nth-child(2),
|
||||
.center {
|
||||
text-align: center;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
|
||||
<p>title: '<span>{{ title }}</span>'</p>
|
||||
<h2 class="center">content {{ media | length }} item(s)</h2>
|
||||
<table class="content">
|
||||
<tr>
|
||||
<th>about</th>
|
||||
<th>preview</th>
|
||||
</tr>
|
||||
{% for m in media %}
|
||||
<tr>
|
||||
<td>
|
||||
<ul>
|
||||
<li><a href="{{ m.cdn_url }}">ARCHIVE</a></li>
|
||||
{% if m.hash | length > 1 %}
|
||||
<li>hash: <span>{{ m.hash }}</span></li>
|
||||
{% endif %}
|
||||
<li>key: <span>{{ m.key }}</span></li>
|
||||
<li>type: <span>{{ m.mimetype }}</span></li>
|
||||
</ul>
|
||||
|
||||
</td>
|
||||
<td>
|
||||
{# mimetype is None when it could not be guessed, so fall back to '' before the substring tests #}
{% set mimetype = m.mimetype or '' %}
{% if 'image' in mimetype %}
<img src="{{ m.cdn_url }}" style="max-height:400px;max-width:400px;">
{% elif 'video' in mimetype %}
<video src="{{ m.cdn_url }}" controls style="max-height:400px;max-width:600px;">
    Your browser does not support the video element.
</video>
{% elif 'audio' in mimetype %}
<audio controls>
    <source src="{{ m.cdn_url }}" type="{{ mimetype }}">
    Your browser does not support the audio element.
</audio>
{% else %}
No preview available, please open the link.
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
<h2>metadata</h2>
|
||||
<table class="metadata">
|
||||
<tr>
|
||||
<th>key</th>
|
||||
<th>value</th>
|
||||
</tr>
|
||||
{% for key in metadata %}
|
||||
<tr>
|
||||
<td>{{ key }}</td>
|
||||
<td>{{ metadata[key] }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
||||
@@ -3,8 +3,7 @@ from __future__ import annotations
|
||||
from ast import List
|
||||
from typing import Any, Union, Dict
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
import json
|
||||
import mimetypes
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -12,5 +11,11 @@ class Media:
|
||||
filename: str
|
||||
key: str = None
|
||||
cdn_url: str = None
|
||||
mimetype: str = None # eg: image/jpeg
|
||||
# id: str = None
|
||||
# hash: str = None # TODO: added by enrichers
|
||||
|
||||
def set_mimetype(self) -> Media:
    """Fill in `mimetype` from the filename when it is not already set.

    An existing (truthy) mimetype is left untouched. Returns `self` so the
    call can be chained.
    """
    if self.mimetype:
        return self
    # guess_type returns (type, encoding); only the type is kept
    guessed_type, _encoding = mimetypes.guess_type(self.filename)
    self.mimetype = guessed_type
    return self
|
||||
|
||||
@@ -3,7 +3,8 @@ from __future__ import annotations
|
||||
from ast import List, Set
|
||||
from typing import Any, Union, Dict
|
||||
from dataclasses import dataclass, field
|
||||
import datetime
|
||||
import datetime, mimetypes
|
||||
from loguru import logger
|
||||
# import json
|
||||
|
||||
from media import Media
|
||||
@@ -12,9 +13,11 @@ from media import Media
|
||||
@dataclass
|
||||
class Metadata:
|
||||
status: str = ""
|
||||
_processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
|
||||
metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
tmp_keys: Set[str] = field(default_factory=set) # keys that are not to be saved in DBs
|
||||
media: List[Media] = field(default_factory=list)
|
||||
final_media: Media = None # can be overwritten by formatters
|
||||
rearchivable: bool = False
|
||||
|
||||
# def __init__(self, url, metadata = {}) -> None:
|
||||
@@ -85,13 +88,20 @@ class Metadata:
|
||||
return ts
|
||||
|
||||
def add_media(self, media: Media) -> Metadata:
    """Append `media` to this item's media list.

    `media.set_mimetype()` is called first so the stored entry carries a
    mimetype whenever one can be determined. Returns `self` (as
    `set_final_media` does) so calls can be chained; the previous
    implementation returned the result of `list.append`, which is always None.
    """
    media.set_mimetype()
    self.media.append(media)
    return self
|
||||
|
||||
def set_final_media(self, final: Media) -> Metadata:
    """Record `final` as the final media of this item and return `self`.

    A falsy `final` is ignored. Replacing an already-set final media logs a
    warning first. `final.set_mimetype()` is called before it is stored.
    """
    if not final:
        return self
    if self.final_media:
        logger.warning(f"overwriting final media value :{self.final_media} with {final}")
    final.set_mimetype()
    self.final_media = final
    return self
|
||||
|
||||
def get_single_media(self) -> Media:
    """Return the final media when one is set, otherwise the first media entry."""
    # TODO: check if formatters were applied and choose with priority
    return self.final_media or self.media[0]
|
||||
|
||||
# def as_json(self) -> str:
|
||||
@@ -99,6 +109,12 @@ class Metadata:
|
||||
# return json.dumps(self.metadata)
|
||||
# #TODO: datetime is not serializable
|
||||
|
||||
def get_clean_metadata(self) -> Dict[str, Any]:
    """Return a new dict with the metadata entries that are not temporary.

    Keys listed in `tmp_keys` are left out, and a "processed_at" entry holding
    `_processed_at` is added (replacing any metadata entry of the same name).
    The result is a plain dict, not a Metadata instance.
    """
    clean = {k: v for k, v in self.metadata.items() if k not in self.tmp_keys}
    clean["processed_at"] = self._processed_at  # TODO: move to enrichment
    return clean
|
||||
|
||||
def cleanup(self) -> Metadata:
|
||||
# TODO: refactor so it returns a JSON with all intended properties, except tmp_keys
|
||||
# the code below leads to errors if database needs tmp_keys after they are removed
|
||||
|
||||
@@ -4,6 +4,8 @@ from typing import Union, Dict
|
||||
from dataclasses import dataclass
|
||||
|
||||
from archivers import Archiverv2
|
||||
from feeders import Feeder
|
||||
from formatters import Formatter
|
||||
from storages import StorageV2
|
||||
from enrichers import Enricher
|
||||
from databases import Database
|
||||
@@ -13,7 +15,6 @@ import tempfile, time, traceback
|
||||
from loguru import logger
|
||||
|
||||
|
||||
|
||||
"""
|
||||
how not to couple the different pieces of logic
|
||||
due to the use of constants for the metadata keys?
|
||||
@@ -132,7 +133,8 @@ class ArchivingOrchestrator:
|
||||
# Archiver.init(a, config)
|
||||
# for a in config.archivers
|
||||
# ]
|
||||
self.feeder = config.feeder
|
||||
self.feeder : Feeder = config.feeder
|
||||
self.formatter : Formatter = config.formatter
|
||||
self.enrichers = config.enrichers
|
||||
self.archivers: List[Archiverv2] = config.archivers
|
||||
self.databases: List[Database] = config.databases
|
||||
@@ -237,14 +239,21 @@ class ArchivingOrchestrator:
|
||||
for e in self.enrichers:
|
||||
result.merge(e.enrich(result))
|
||||
|
||||
# formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
|
||||
for f in self.formatters:
|
||||
result.merge(f.format(result))
|
||||
|
||||
# storage
|
||||
# store media
|
||||
unstored_media = result.media[::]
|
||||
result.media = []
|
||||
for s in self.storages:
|
||||
for i, m in enumerate(result.media):
|
||||
result.media[i] = s.store(m, result)
|
||||
for m in unstored_media:
|
||||
result.media.append(s.store(m, result))
|
||||
|
||||
# formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
|
||||
# TODO: should there only be 1 formatter?
|
||||
# for f in self.formatters:
|
||||
# result.merge(f.format(result))
|
||||
# final format and store it
|
||||
if (final_media := self.formatter.format(result)):
|
||||
for s in self.storages:
|
||||
result.set_final_media(s.store(final_media, result))
|
||||
|
||||
# signal completion to databases (DBs, Google Sheets, CSV, ...)
|
||||
# a hash registration service could be one database: forensic archiving
|
||||
|
||||
@@ -12,6 +12,7 @@ class Gsheets(Step):
|
||||
super().__init__(config)
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
|
||||
assert self.sheet is not None, "You need to define a sheet name in your orchestration file when using gsheets."
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
|
||||
Reference in New Issue
Block a user