thumbnails enricher

This commit is contained in:
msramalho
2023-01-17 16:29:27 +00:00
parent 74e50eccf1
commit 47dc788143
8 changed files with 208 additions and 35 deletions

View File

@@ -1,4 +1,5 @@
from .enricher import Enricher
from .screenshot_enricher import ScreenshotEnricher
from .wayback_enricher import WaybackEnricher
from .hash_enricher import HashEnricher
from .hash_enricher import HashEnricher
from .thumbnail_enricher import ThumbnailEnricher

View File

@@ -0,0 +1,46 @@
import uuid
from media import Media
from . import Enricher
from metadata import Metadata
from loguru import logger
import ffmpeg, os
class ThumbnailEnricher(Enricher):
    """
    Generates thumbnails for all the video media of an item.

    For every media entry that reports itself as a video, frames are sampled
    with ffmpeg, scaled to a width of 512px and written as JPEG files. The
    resulting files are wrapped in `Media` objects and attached to the video
    under its "thumbnails" property.
    """
    name = "thumbnail_enricher"

    # fps used when the video has no usable "duration" property
    DEFAULT_FPS = 0.5

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """This enricher exposes no configuration options."""
        return {}

    @staticmethod
    def _fps_for_duration(duration) -> float:
        """
        Pick a sampling rate so that short videos yield roughly 10 frames,
        videos under two minutes roughly 20 and longer ones roughly 40.

        Falls back to `DEFAULT_FPS` when `duration` is missing, not numeric,
        or not a positive number (which would otherwise divide by zero).
        """
        if duration is None:
            return ThumbnailEnricher.DEFAULT_FPS
        try:
            seconds = float(duration)
        except (TypeError, ValueError):
            return ThumbnailEnricher.DEFAULT_FPS
        # `not seconds > 0` also rejects NaN, which compares False to everything
        if not seconds > 0:
            return ThumbnailEnricher.DEFAULT_FPS
        if seconds < 60:
            return 10.0 / seconds
        if seconds < 120:
            return 20.0 / seconds
        return 40.0 / seconds

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Generate thumbnails for each video in `to_enrich.media` and store them
        on that video as a list of `Media` under the "thumbnails" property.

        Non-video media are left untouched. Errors raised by ffmpeg propagate
        to the caller.
        """
        logger.debug("generating thumbnails")
        folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4()))
        os.makedirs(folder, exist_ok=True)

        for i, m in enumerate(to_enrich.media):
            if not m.is_video():
                continue
            logger.debug(f"generating thumbnails for {m.filename}")

            # Each video gets its own sub-folder: with a single shared folder
            # a later video would overwrite the earlier video's out%d.jpg
            # files and list leftovers that do not belong to it.
            video_folder = os.path.join(folder, str(i))
            os.makedirs(video_folder, exist_ok=True)

            fps = self._fps_for_duration(m.get("duration"))
            stream = ffmpeg.input(m.filename)
            stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
            stream.output(os.path.join(video_folder, 'out%d.jpg')).run()

            # os.listdir gives no ordering guarantee; sorting by (length, name)
            # orders out1.jpg, out2.jpg, ..., out10.jpg numerically so the
            # thumbnail ids follow the frame order.
            frames = sorted(
                (fname for fname in os.listdir(video_folder) if fname.endswith('.jpg')),
                key=lambda fname: (len(fname), fname),
            )
            thumbnails_media = [
                Media(filename=os.path.join(video_folder, fname)).set("id", f"thumbnail_{t}")
                for t, fname in enumerate(frames)
            ]
            m.set("thumbnails", thumbnails_media)

View File

@@ -37,7 +37,7 @@ class WaybackEnricher(Enricher):
r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})
if r.status_code != 200:
logger.error(em:=f"Internet archive failed with status of {r.status_code}: {r.json()}")
logger.error(em := f"Internet archive failed with status of {r.status_code}: {r.json()}")
to_enrich.set("wayback", em)
return
@@ -66,3 +66,4 @@ class WaybackEnricher(Enricher):
to_enrich.set("wayback", wayback_url)
else:
to_enrich.set("wayback", {"job_id": job_id, "check_status": f'https://web.archive.org/save/status/{job_id}'})
to_enrich.set("wayback lookup", f"https://web.archive.org/web/*/{url}")

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
import mimetypes
from metadata import Metadata
from media import Media
from formatters import Formatter
@@ -16,14 +17,28 @@ class HtmlFormatter(Formatter):
# without this STEP.__init__ is not called
super().__init__(config)
self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")))
self.environment.filters.update({
'is_list': is_list_jinja,
'is_video': is_video_jinja,
'is_image': is_image_jinja,
'is_audio': is_audio_jinja,
'is_media': is_media_jinja,
})
self.template = self.environment.get_template("html_template.html")
@staticmethod
def configs() -> dict:
return {}
return {
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"},
}
def format(self, item: Metadata) -> Media:
print("FORMATTING")
media = item.media
# thumbnails
# TODO: thumbnails_media work per media, gah
# if self.detect_thumbnails:
content = self.template.render(
url=item.get_url(),
title=item.get_title(),
@@ -34,3 +49,28 @@ class HtmlFormatter(Formatter):
with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content)
return Media(filename=html_path)
# JINJA helper filters
def is_list_jinja(v) -> bool:
    """Jinja filter: tell whether `v` is a `list` instance."""
    is_list = isinstance(v, list)
    return is_list
def is_video_jinja(s: str) -> bool:
    """Jinja filter: tell whether the mimetype guessed from the name/URL `s` mentions "video"."""
    guessed, _encoding = mimetypes.guess_type(s)
    if not guessed:
        # no mimetype could be guessed from the name
        return False
    return "video" in guessed
def is_image_jinja(s: str) -> bool:
    """Jinja filter: tell whether the mimetype guessed from the name/URL `s` mentions "image"."""
    guessed, _encoding = mimetypes.guess_type(s)
    if not guessed:
        # no mimetype could be guessed from the name
        return False
    return "image" in guessed
def is_audio_jinja(s: str) -> bool:
    """Jinja filter: tell whether the mimetype guessed from the name/URL `s` mentions "audio"."""
    guessed, _encoding = mimetypes.guess_type(s)
    if not guessed:
        # no mimetype could be guessed from the name
        return False
    return "audio" in guessed
def is_media_jinja(v) -> bool:
    """Jinja filter: tell whether `v` is a `Media` instance."""
    is_media = isinstance(v, Media)
    return is_media

View File

@@ -1,5 +1,5 @@
{# templates/results.html #}
{% import 'media.html' as macros %}
<!DOCTYPE html>
<html lang="en">
@@ -55,6 +55,45 @@
font-size: large;
display: none;
}
img,
video {
filter: gray;
-webkit-filter: grayscale(1);
filter: grayscale(1);
}
/* Disable grayscale on hover */
img:hover,
video:hover {
-webkit-filter: grayscale(0);
filter: none;
}
.collapsible {
background-color: #777;
color: white;
cursor: pointer;
padding: 5px;
margin: 10px;
width: 100%;
border: none;
text-align: left;
outline: none;
font-size: 15px;
}
.active,
.collapsible:hover {
background-color: #555;
}
.collapsible-content {
padding: 0 18px;
display: none;
overflow: hidden;
background-color: #f1f1f1;
}
</style>
</head>
@@ -76,37 +115,31 @@
<li><b>type:</b> <span class="copy">{{ m.mimetype }}</span></li>
{% for prop in m.properties %}
{% if m.properties[prop] | length > 1 %}
{% if m.properties[prop] | is_list %}
<p></p>
<div>
<b class="collapsible" title="expand">{{ prop }}:</b>
<div class="collapsible-content">
{% for subprop in m.properties[prop] %}
{% if subprop | is_media %}
{{ macros.display_media(subprop) }}
{% else %}
{{ subprop }}
{% endif %}
{% endfor %}
</div>
</div>
<p></p>
{% elif m.properties[prop] | length > 1 %}
<li><b>{{ prop }}:</b> <span class="copy">{{ m.properties[prop] }}</span></li>
{% endif %}
{% endfor %}
</ul>
</td>
<td>
{% for url in m.urls %}
{% if 'http' in url %}
{% if 'image' in m.mimetype %}
<a href="{{ url }}">
<img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
</a>
{% elif 'video' in m.mimetype %}
<video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
Your browser does not support the video element.
</video>
{% elif 'audio' in m.mimetype %}
<audio controls>
<source src="{{ url }}" type="{{ m.mimetype }}">
Your browser does not support the audio element.
</audio>
{% else %}
No preview available.
{% endif %}
{% endif %}
<br>
<a href="{{ url }}">open</a> or
<a href="{{ url }}" download="">download</a>
{% endfor %}
{{ macros.display_media(m) }}
</td>
</tr>
{% endfor %}
@@ -153,10 +186,27 @@
if (e.clipboardData) {
e.clipboardData.setData("text/plain", el.textContent);
console.log(e.clipboardData.getData("text"))
showNotification("copied...")
showNotification("copied!")
}
})
})
// collapsibles: clicking a ".collapsible" header shows or hides the
// ".collapsible-content" panel found inside the same parent element
let coll = document.getElementsByClassName("collapsible");
let i;
for (i = 0; i < coll.length; i++) {
    coll[i].addEventListener("click", function() {
        // "active" only drives the header styling
        this.classList.toggle("active");
        const panel = this.parentElement.querySelector(".collapsible-content");
        const isOpen = panel.style.display === "block";
        panel.style.display = isOpen ? "none" : "block";
    });
}
</script>
</html>

View File

@@ -0,0 +1,28 @@
{#
  display_media(m): renders every URL in m.urls.
  - empty URL: a "No URL available" note
  - URL containing 'http': an inline image / video / audio preview chosen
    from m.mimetype, or a "No preview available" note for other types
  - any other URL: the URL itself, passed through the urlize filter
#}
{% macro display_media(m) -%}
{% for url in m.urls %}
    {% if url | length == 0 %}
        No URL available for {{ m.key }}.
    {% elif 'http' in url %}
        {% if 'image' in m.mimetype %}
            <a href="{{ url }}">
                {# img is a void element: it takes no closing tag #}
                <img src="{{ url }}" style="max-height:400px;max-width:400px;">
            </a>
        {% elif 'video' in m.mimetype %}
            <video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
                Your browser does not support the video element.
            </video>
        {% elif 'audio' in m.mimetype %}
            <audio controls>
                <source src="{{ url }}" type="{{ m.mimetype }}">
                Your browser does not support the audio element.
            </audio>
        {% else %}
            No preview available for {{ m.key }}.
        {% endif %}
    {% else %}
        {# use the loop variable: the media object exposes `urls`, not `url` #}
        {{ url | urlize }}
    {% endif %}
{% endfor %}
{%- endmacro -%}

View File

@@ -1,18 +1,17 @@
from __future__ import annotations
from ast import List
from typing import Any, Union, Dict
from typing import Any
from dataclasses import dataclass, field
import mimetypes
@dataclass
class Media:
# other properties eg: hash, id, exif, ...
filename: str
key: str = None
_mimetype: str = None # eg: image/jpeg
urls: List[str] = field(default_factory=list)
_mimetype: str = None # eg: image/jpeg
properties: dict = field(default_factory=dict)
def set(self, key: str, value: Any) -> Media:
@@ -38,4 +37,4 @@ class Media:
self._mimetype = v
def is_video(self) -> bool:
return self._mimetype.startswith("video")
return self.mimetype.startswith("video")

View File

@@ -6,6 +6,7 @@ from dataclasses import dataclass
from archivers import Archiverv2
from feeders import Feeder
from formatters import Formatter
from media import Media
from storages import StorageV2
from enrichers import Enricher
from databases import Database
@@ -177,6 +178,13 @@ class ArchivingOrchestrator:
for s in self.storages:
for m in result.media:
s.store(m, result) # modifies media
# Media can be inside media properties, examples include transformations on original media
for prop in m.properties.values():
if isinstance(prop, Media):
s.store(prop, result)
if isinstance(prop, list) and len(prop)>0 and isinstance(prop[0], Media):
for prop_media in prop:
s.store(prop_media, result)
# formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
# TODO: should there only be 1 formatter?