mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
31c07a02e1 | ||
|
|
bd231488ff | ||
|
|
fb197f1064 | ||
|
|
ec1a78e973 | ||
|
|
139bdec051 | ||
|
|
f15a70f859 | ||
|
|
419eaef449 | ||
|
|
1695954c98 | ||
|
|
aa71c85a98 | ||
|
|
7a5c9c65bd | ||
|
|
fc93ebaba0 | ||
|
|
1b44a302cd | ||
|
|
1368f7aebc | ||
|
|
e3a0003a47 | ||
|
|
59551b3b20 | ||
|
|
f086d89111 | ||
|
|
3dd3775cbd |
@@ -9,7 +9,7 @@ RUN pip install --upgrade pip && \
|
|||||||
pip install pipenv && \
|
pip install pipenv && \
|
||||||
add-apt-repository ppa:mozillateam/ppa && \
|
add-apt-repository ppa:mozillateam/ppa && \
|
||||||
apt-get update && \
|
apt-get update && \
|
||||||
apt-get install -y gcc ffmpeg fonts-noto && \
|
apt-get install -y gcc ffmpeg fonts-noto exiftool && \
|
||||||
apt-get install -y --no-install-recommends firefox-esr && \
|
apt-get install -y --no-install-recommends firefox-esr && \
|
||||||
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
|
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
|
||||||
wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
|
wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ steps:
|
|||||||
# - wacz_archiver_enricher
|
# - wacz_archiver_enricher
|
||||||
enrichers:
|
enrichers:
|
||||||
- hash_enricher
|
- hash_enricher
|
||||||
|
# - metadata_enricher
|
||||||
# - screenshot_enricher
|
# - screenshot_enricher
|
||||||
# - thumbnail_enricher
|
# - thumbnail_enricher
|
||||||
# - wayback_archiver_enricher
|
# - wayback_archiver_enricher
|
||||||
|
|||||||
@@ -27,11 +27,6 @@ class Archiver(Step):
|
|||||||
# used to clean unnecessary URL parameters OR unfurl redirect links
|
# used to clean unnecessary URL parameters OR unfurl redirect links
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def is_rearchivable(self, url: str) -> bool:
|
|
||||||
# archivers can signal if it does not make sense to rearchive a piece of content
|
|
||||||
# default is rearchiving
|
|
||||||
return True
|
|
||||||
|
|
||||||
def _guess_file_type(self, path: str) -> str:
|
def _guess_file_type(self, path: str) -> str:
|
||||||
"""
|
"""
|
||||||
Receives a URL or filename and returns global mimetype like 'image' or 'video'
|
Receives a URL or filename and returns global mimetype like 'image' or 'video'
|
||||||
@@ -56,6 +51,7 @@ class Archiver(Step):
|
|||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||||
}
|
}
|
||||||
d = requests.get(url, headers=headers)
|
d = requests.get(url, headers=headers)
|
||||||
|
assert d.status_code == 200, f"got response code {d.status_code} for {url=}"
|
||||||
with open(to_filename, 'wb') as f:
|
with open(to_filename, 'wb') as f:
|
||||||
f.write(d.content)
|
f.write(d.content)
|
||||||
return to_filename
|
return to_filename
|
||||||
|
|||||||
@@ -19,10 +19,6 @@ class TelegramArchiver(Archiver):
|
|||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def is_rearchivable(self, url: str) -> bool:
|
|
||||||
# telegram posts are static
|
|
||||||
return False
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
# detect URLs that we definitely cannot handle
|
# detect URLs that we definitely cannot handle
|
||||||
@@ -57,10 +53,10 @@ class TelegramArchiver(Archiver):
|
|||||||
|
|
||||||
if not len(image_urls): return False
|
if not len(image_urls): return False
|
||||||
for img_url in image_urls:
|
for img_url in image_urls:
|
||||||
result.add_media(Media(self.download_from_url(img_url)))
|
result.add_media(Media(self.download_from_url(img_url, item=item)))
|
||||||
else:
|
else:
|
||||||
video_url = video.get('src')
|
video_url = video.get('src')
|
||||||
m_video = Media(self.download_from_url(video_url))
|
m_video = Media(self.download_from_url(video_url, item=item))
|
||||||
# extract duration from HTML
|
# extract duration from HTML
|
||||||
try:
|
try:
|
||||||
duration = s.find_all('time')[0].contents[0]
|
duration = s.find_all('time')[0].contents[0]
|
||||||
|
|||||||
@@ -38,10 +38,6 @@ class TelethonArchiver(Archiver):
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
def is_rearchivable(self, url: str) -> bool:
|
|
||||||
# telegram posts are static
|
|
||||||
return False
|
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
"""
|
"""
|
||||||
1. trigger login process for telegram or proceed if already saved in a session file
|
1. trigger login process for telegram or proceed if already saved in a session file
|
||||||
|
|||||||
@@ -16,10 +16,6 @@ class TiktokArchiver(Archiver):
|
|||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def is_rearchivable(self, url: str) -> bool:
|
|
||||||
# TikTok posts are static
|
|
||||||
return False
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
if 'tiktok.com' not in url:
|
if 'tiktok.com' not in url:
|
||||||
|
|||||||
@@ -37,10 +37,6 @@ class TwitterArchiver(Archiver):
|
|||||||
# https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
|
# https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
|
||||||
return self.link_clean_pattern.sub("\\1", url)
|
return self.link_clean_pattern.sub("\\1", url)
|
||||||
|
|
||||||
def is_rearchivable(self, url: str) -> bool:
|
|
||||||
# Twitter posts are static (for now)
|
|
||||||
return False
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
"""
|
"""
|
||||||
if this url is archivable will download post info and look for other posts from the same group with media.
|
if this url is archivable will download post info and look for other posts from the same group with media.
|
||||||
@@ -78,7 +74,7 @@ class TwitterArchiver(Archiver):
|
|||||||
media.set("src", variant.url)
|
media.set("src", variant.url)
|
||||||
mimetype = variant.contentType
|
mimetype = variant.contentType
|
||||||
elif type(tweet_media) == Photo:
|
elif type(tweet_media) == Photo:
|
||||||
media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig').replace('name=small', 'name=orig'))
|
media.set("src", UrlUtil.twitter_best_quality_url(tweet_media.fullUrl))
|
||||||
mimetype = "image/jpeg"
|
mimetype = "image/jpeg"
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Could not get media URL of {tweet_media}")
|
logger.warning(f"Could not get media URL of {tweet_media}")
|
||||||
@@ -118,6 +114,7 @@ class TwitterArchiver(Archiver):
|
|||||||
|
|
||||||
for i, u in enumerate(urls):
|
for i, u in enumerate(urls):
|
||||||
media = Media(filename="")
|
media = Media(filename="")
|
||||||
|
u = UrlUtil.twitter_best_quality_url(u)
|
||||||
media.set("src", u)
|
media.set("src", u)
|
||||||
ext = ""
|
ext = ""
|
||||||
if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
|
if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
|
||||||
|
|||||||
@@ -27,10 +27,6 @@ class VkArchiver(Archiver):
|
|||||||
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
|
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
|
||||||
}
|
}
|
||||||
|
|
||||||
def is_rearchivable(self, url: str) -> bool:
|
|
||||||
# VK content is static
|
|
||||||
return False
|
|
||||||
|
|
||||||
def download(self, item: Metadata) -> Metadata:
|
def download(self, item: Metadata) -> Metadata:
|
||||||
url = item.get_url()
|
url = item.get_url()
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,15 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import os
|
||||||
|
import traceback
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from dataclasses_json import dataclass_json, config
|
from dataclasses_json import dataclass_json, config
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
||||||
|
import ffmpeg
|
||||||
|
from ffmpeg._run import Error
|
||||||
|
|
||||||
from .context import ArchivingContext
|
from .context import ArchivingContext
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
@@ -77,3 +82,20 @@ class Media:
|
|||||||
|
|
||||||
def is_image(self) -> bool:
|
def is_image(self) -> bool:
|
||||||
return self.mimetype.startswith("image")
|
return self.mimetype.startswith("image")
|
||||||
|
|
||||||
|
def is_valid_video(self) -> bool:
    """
    Best-effort check that ``self.filename`` holds a usable video.

    ``self.is_video()`` should be used together with this method.

    The file is probed with ffmpeg for video streams:
    - True as soon as a stream reports a positive ``duration_ts``;
    - False when ffmpeg raises its own ``Error`` (bad/unreadable file) or
      when every probed stream reports a non-positive ``duration_ts``;
    - otherwise (a stream has no ``duration_ts``, or probing failed for an
      unexpected reason) the decision falls back to a minimum file size of
      20_000 bytes, and to True if even the size cannot be read.
    """
    try:
        streams = ffmpeg.probe(self.filename, select_streams='v')['streams']
        # diagnostic output only, so it is logged at debug level
        logger.debug(f"STREAMS FOR {self.filename} {streams}")
        for stream in streams:
            duration_ts = stream.get("duration_ts")
            if duration_ts is None:
                # the probe cannot tell for this stream: use the size heuristic
                # below instead of failing on a None comparison
                break
            if duration_ts > 0:
                return True
        else:
            # every stream was inspected and none had a positive duration
            return False
    except Error:
        return False  # ffmpeg errors when reading bad files
    except Exception as e:
        logger.error(e)
        logger.error(traceback.format_exc())

    try:
        return os.path.getsize(self.filename) > 20_000
    except (OSError, TypeError, ValueError):
        # size unavailable: give the file the benefit of the doubt
        return True
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import hashlib
|
||||||
from typing import Any, List, Union, Dict
|
from typing import Any, List, Union, Dict
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from dataclasses_json import dataclass_json, config
|
from dataclasses_json import dataclass_json, config
|
||||||
@@ -16,7 +17,6 @@ class Metadata:
|
|||||||
status: str = "no archiver"
|
status: str = "no archiver"
|
||||||
metadata: Dict[str, Any] = field(default_factory=dict)
|
metadata: Dict[str, Any] = field(default_factory=dict)
|
||||||
media: List[Media] = field(default_factory=list)
|
media: List[Media] = field(default_factory=list)
|
||||||
rearchivable: bool = True # defaults to true, archivers can overwrite
|
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
self.set("_processed_at", datetime.datetime.utcnow())
|
self.set("_processed_at", datetime.datetime.utcnow())
|
||||||
@@ -29,7 +29,6 @@ class Metadata:
|
|||||||
if overwrite_left:
|
if overwrite_left:
|
||||||
if right.status and len(right.status):
|
if right.status and len(right.status):
|
||||||
self.status = right.status
|
self.status = right.status
|
||||||
self.rearchivable |= right.rearchivable
|
|
||||||
for k, v in right.metadata.items():
|
for k, v in right.metadata.items():
|
||||||
assert k not in self.metadata or type(v) == type(self.get(k))
|
assert k not in self.metadata or type(v) == type(self.get(k))
|
||||||
if type(v) not in [dict, list, set] or k not in self.metadata:
|
if type(v) not in [dict, list, set] or k not in self.metadata:
|
||||||
@@ -44,6 +43,7 @@ class Metadata:
|
|||||||
|
|
||||||
def store(self: Metadata, override_storages: List = None):
|
def store(self: Metadata, override_storages: List = None):
|
||||||
# calls .store for all contained media. storages [Storage]
|
# calls .store for all contained media. storages [Storage]
|
||||||
|
self.remove_duplicate_media_by_hash()
|
||||||
storages = override_storages or ArchivingContext.get("storages")
|
storages = override_storages or ArchivingContext.get("storages")
|
||||||
for media in self.media:
|
for media in self.media:
|
||||||
media.store(override_storages=storages, url=self.get_url())
|
media.store(override_storages=storages, url=self.get_url())
|
||||||
@@ -124,6 +124,27 @@ class Metadata:
|
|||||||
if m.get("id") == id: return m
|
if m.get("id") == id: return m
|
||||||
return default
|
return default
|
||||||
|
|
||||||
|
def remove_duplicate_media_by_hash(self) -> None:
    """
    Drop media entries whose content hash was already seen, keeping the first.

    Iterates ``self.media`` in order; a media without a stored "hash" gets a
    SHA-256 digest computed from its file. Media whose file cannot be read
    are kept as-is, since they cannot be compared. ``self.media`` is replaced
    by the de-duplicated list.
    """
    def calculate_hash_in_chunks(hash_algo, chunksize, filename) -> str:
        # taken from hash_enricher, cannot be isolated to misc due to circular imports
        with open(filename, "rb") as f:
            while buf := f.read(chunksize):
                hash_algo.update(buf)
        return hash_algo.hexdigest()

    media_hashes = set()
    new_media = []
    for m in self.media:
        h = m.get("hash")
        if not h:
            try:
                h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
            except (OSError, TypeError):
                # missing/unreadable file: no hash to compare, so keep the
                # media instead of aborting the whole de-duplication
                new_media.append(m)
                continue
        # hash_enricher stores hashes as "<algorithm>:<hexdigest>"; compare on
        # the bare digest so stored and freshly computed values can match
        digest = h.rsplit(":", 1)[-1] if isinstance(h, str) else h
        if digest and digest in media_hashes:
            continue
        media_hashes.add(digest)
        new_media.append(m)
    self.media = new_media
|
||||||
|
|
||||||
def get_first_image(self, default=None) -> Media:
|
def get_first_image(self, default=None) -> Media:
|
||||||
for m in self.media:
|
for m in self.media:
|
||||||
if "image" in m.mimetype: return m
|
if "image" in m.mimetype: return m
|
||||||
|
|||||||
@@ -62,11 +62,7 @@ class ArchivingOrchestrator:
|
|||||||
result.set_url(url)
|
result.set_url(url)
|
||||||
if original_url != url: result.set("original_url", original_url)
|
if original_url != url: result.set("original_url", original_url)
|
||||||
|
|
||||||
# 2 - rearchiving logic + notify start to DB
|
# 2 - notify start to DB
|
||||||
# archivers can signal whether the content is rearchivable: eg: tweet vs webpage
|
|
||||||
for a in self.archivers: result.rearchivable |= a.is_rearchivable(url)
|
|
||||||
logger.debug(f"{result.rearchivable=} for {url=}")
|
|
||||||
|
|
||||||
# signal to DB that archiving has started
|
# signal to DB that archiving has started
|
||||||
# and propagate already archived if it exists
|
# and propagate already archived if it exists
|
||||||
cached_result = None
|
cached_result = None
|
||||||
@@ -78,7 +74,7 @@ class ArchivingOrchestrator:
|
|||||||
d.started(result)
|
d.started(result)
|
||||||
if (local_result := d.fetch(result)):
|
if (local_result := d.fetch(result)):
|
||||||
cached_result = (cached_result or Metadata()).merge(local_result)
|
cached_result = (cached_result or Metadata()).merge(local_result)
|
||||||
if cached_result and not cached_result.rearchivable:
|
if cached_result:
|
||||||
logger.debug("Found previously archived entry")
|
logger.debug("Found previously archived entry")
|
||||||
for d in self.databases:
|
for d in self.databases:
|
||||||
d.done(cached_result)
|
d.done(cached_result)
|
||||||
@@ -109,7 +105,6 @@ class ArchivingOrchestrator:
|
|||||||
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
|
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
|
||||||
result.store()
|
result.store()
|
||||||
|
|
||||||
#TODO: remove any duplicate media, if hash is available
|
|
||||||
|
|
||||||
# 6 - format and store formatted if needed
|
# 6 - format and store formatted if needed
|
||||||
# enrichers typically need access to already stored URLs etc
|
# enrichers typically need access to already stored URLs etc
|
||||||
|
|||||||
@@ -1,9 +1,8 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass
|
||||||
from inspect import ClassFoundException
|
from inspect import ClassFoundException
|
||||||
from typing import Type
|
from typing import Type
|
||||||
from abc import ABC
|
from abc import ABC
|
||||||
# from collections.abc import Iterable
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -6,3 +6,4 @@ from .thumbnail_enricher import ThumbnailEnricher
|
|||||||
from .wacz_enricher import WaczArchiverEnricher
|
from .wacz_enricher import WaczArchiverEnricher
|
||||||
from .whisper_enricher import WhisperEnricher
|
from .whisper_enricher import WhisperEnricher
|
||||||
from .pdq_hash_enricher import PdqHashEnricher
|
from .pdq_hash_enricher import PdqHashEnricher
|
||||||
|
from .metadata_enricher import MetadataEnricher
|
||||||
@@ -23,7 +23,7 @@ class HashEnricher(Enricher):
|
|||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {
|
return {
|
||||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
||||||
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
|
"chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
|
||||||
}
|
}
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
@@ -34,7 +34,7 @@ class HashEnricher(Enricher):
|
|||||||
if len(hd := self.calculate_hash(m.filename)):
|
if len(hd := self.calculate_hash(m.filename)):
|
||||||
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
||||||
|
|
||||||
def calculate_hash(self, filename):
|
def calculate_hash(self, filename) -> str:
|
||||||
hash = None
|
hash = None
|
||||||
if self.algorithm == "SHA-256":
|
if self.algorithm == "SHA-256":
|
||||||
hash = hashlib.sha256()
|
hash = hashlib.sha256()
|
||||||
|
|||||||
47
src/auto_archiver/enrichers/metadata_enricher.py
Normal file
47
src/auto_archiver/enrichers/metadata_enricher.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
import subprocess
|
||||||
|
import traceback
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from . import Enricher
|
||||||
|
from ..core import Metadata
|
||||||
|
|
||||||
|
|
||||||
|
class MetadataEnricher(Enricher):
    """
    Extracts metadata information from files using exiftool.
    """
    name = "metadata_enricher"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        # this enricher has no configurable options
        return {}

    def enrich(self, to_enrich: Metadata) -> None:
        """Attach exiftool output under the "metadata" key of each media that yields any."""
        url = to_enrich.get_url()
        logger.debug(f"extracting EXIF metadata for {url=}")

        for i, m in enumerate(to_enrich.media):
            if len(md := self.get_metadata(m.filename)):
                to_enrich.media[i].set("metadata", md)

    def get_metadata(self, filename: str) -> dict:
        """
        Run ``exiftool`` on ``filename`` and parse its "field : value" output.

        Returns a dict mapping each field name to its value (both stripped),
        or an empty dict when exiftool is missing or the call fails. Output
        lines without a ':' separator are skipped, so one unparseable line
        does not discard the fields that were parsed.
        """
        try:
            # Run ExifTool command to extract metadata from the file
            cmd = ['exiftool', filename]
            result = subprocess.run(cmd, capture_output=True, text=True)
        except FileNotFoundError:
            logger.error("[exif_enricher] ExifTool not found. Make sure ExifTool is installed and added to PATH.")
            return {}
        except Exception as e:
            logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
            return {}

        # Process the output to extract individual metadata fields
        metadata = {}
        for line in result.stdout.splitlines():
            field, separator, value = line.partition(':')
            if not separator:
                continue  # not a "field: value" line
            metadata[field.strip()] = value.strip()
        return metadata
|
||||||
@@ -29,8 +29,8 @@ class PdqHashEnricher(Enricher):
|
|||||||
|
|
||||||
for m in to_enrich.media:
|
for m in to_enrich.media:
|
||||||
for media in m.all_inner_media(True):
|
for media in m.all_inner_media(True):
|
||||||
if media.is_image() and media.get("id") != "screenshot" and len(hd := self.calculate_pdq_hash(media.filename)):
|
if media.is_image() and "screenshot" not in media.get("id") and "warc-file-" not in media.get("id") and len(hd := self.calculate_pdq_hash(media.filename)):
|
||||||
media.set("pdq_hash", hd)
|
media.set("pdq_hash", hd)
|
||||||
|
|
||||||
def calculate_pdq_hash(self, filename):
|
def calculate_pdq_hash(self, filename):
|
||||||
# returns a hexadecimal string with the perceptual hash for the given filename
|
# returns a hexadecimal string with the perceptual hash for the given filename
|
||||||
|
|||||||
@@ -44,7 +44,6 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
logger.warning(f"ENRICHING WACZ for {url=}")
|
|
||||||
|
|
||||||
collection = str(uuid.uuid4())[0:8]
|
collection = str(uuid.uuid4())[0:8]
|
||||||
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
|
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
|
||||||
@@ -58,6 +57,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
|||||||
"--scopeType", "page",
|
"--scopeType", "page",
|
||||||
"--generateWACZ",
|
"--generateWACZ",
|
||||||
"--text",
|
"--text",
|
||||||
|
"--screenshot", "fullPage",
|
||||||
"--collection", collection,
|
"--collection", collection,
|
||||||
"--id", collection,
|
"--id", collection,
|
||||||
"--saveState", "never",
|
"--saveState", "never",
|
||||||
@@ -80,6 +80,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
|||||||
"--scopeType", "page",
|
"--scopeType", "page",
|
||||||
"--generateWACZ",
|
"--generateWACZ",
|
||||||
"--text",
|
"--text",
|
||||||
|
"--screenshot", "fullPage",
|
||||||
"--collection", collection,
|
"--collection", collection,
|
||||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||||
"--behaviorTimeout", str(self.timeout),
|
"--behaviorTimeout", str(self.timeout),
|
||||||
@@ -136,14 +137,25 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
|||||||
|
|
||||||
# get media out of .warc
|
# get media out of .warc
|
||||||
counter = 0
|
counter = 0
|
||||||
|
seen_urls = set()
|
||||||
with open(warc_filename, 'rb') as warc_stream:
|
with open(warc_filename, 'rb') as warc_stream:
|
||||||
for record in ArchiveIterator(warc_stream):
|
for record in ArchiveIterator(warc_stream):
|
||||||
# only include fetched resources
|
# only include fetched resources
|
||||||
|
if record.rec_type == "resource": # screenshots
|
||||||
|
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
|
||||||
|
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
|
||||||
|
m = Media(filename=fn)
|
||||||
|
to_enrich.add_media(m, "browsertrix-screenshot")
|
||||||
|
counter += 1
|
||||||
|
|
||||||
if record.rec_type != 'response': continue
|
if record.rec_type != 'response': continue
|
||||||
record_url = record.rec_headers.get_header('WARC-Target-URI')
|
record_url = record.rec_headers.get_header('WARC-Target-URI')
|
||||||
if not UrlUtil.is_relevant_url(record_url):
|
if not UrlUtil.is_relevant_url(record_url):
|
||||||
logger.debug(f"Skipping irrelevant URL {record_url} but it's still present in the WACZ.")
|
logger.debug(f"Skipping irrelevant URL {record_url} but it's still present in the WACZ.")
|
||||||
continue
|
continue
|
||||||
|
if record_url in seen_urls:
|
||||||
|
logger.debug(f"Skipping already seen URL {record_url}.")
|
||||||
|
continue
|
||||||
|
|
||||||
# filter by media mimetypes
|
# filter by media mimetypes
|
||||||
content_type = record.http_headers.get("Content-Type")
|
content_type = record.http_headers.get("Content-Type")
|
||||||
@@ -152,11 +164,26 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
|||||||
|
|
||||||
# create local file and add media
|
# create local file and add media
|
||||||
ext = mimetypes.guess_extension(content_type)
|
ext = mimetypes.guess_extension(content_type)
|
||||||
fn = os.path.join(tmp_dir, f"warc-file-{counter}{ext}")
|
warc_fn = f"warc-file-{counter}{ext}"
|
||||||
|
fn = os.path.join(tmp_dir, warc_fn)
|
||||||
|
|
||||||
|
record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
|
||||||
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
|
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
|
||||||
|
|
||||||
m = Media(filename=fn)
|
m = Media(filename=fn)
|
||||||
m.set("src", record_url)
|
m.set("src", record_url)
|
||||||
# TODO URLUTIL to ignore known-recurring media like favicons, profile pictures, etc.
|
# if a link with better quality exists, try to download that
|
||||||
to_enrich.add_media(m, f"browsertrix-media-{counter}")
|
if record_url_best_qual != record_url:
|
||||||
|
try:
|
||||||
|
m.filename = self.download_from_url(record_url_best_qual, warc_fn, to_enrich)
|
||||||
|
m.set("src", record_url_best_qual)
|
||||||
|
m.set("src_alternative", record_url)
|
||||||
|
except Exception as e: logger.warning(f"Unable to download best quality URL for {record_url=} got error {e}, using original in WARC.")
|
||||||
|
|
||||||
|
# remove bad videos
|
||||||
|
if m.is_video() and not m.is_valid_video(): continue
|
||||||
|
|
||||||
|
to_enrich.add_media(m, warc_fn)
|
||||||
counter += 1
|
counter += 1
|
||||||
|
seen_urls.add(record_url)
|
||||||
logger.info(f"WACZ extract_media finished, found {counter} relevant media file(s)")
|
logger.info(f"WACZ extract_media finished, found {counter} relevant media file(s)")
|
||||||
|
|||||||
@@ -65,11 +65,12 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Disable grayscale on hover */
|
/* Disable grayscale on hover */
|
||||||
img:hover,
|
/* img:hover,
|
||||||
video:hover {
|
video:hover {
|
||||||
-webkit-filter: grayscale(0);
|
-webkit-filter: grayscale(0);
|
||||||
filter: none;
|
filter: none;
|
||||||
}
|
} */
|
||||||
|
|
||||||
|
|
||||||
.collapsible {
|
.collapsible {
|
||||||
background-color: #777;
|
background-color: #777;
|
||||||
@@ -101,57 +102,68 @@
|
|||||||
<body>
|
<body>
|
||||||
<div id="notification"></div>
|
<div id="notification"></div>
|
||||||
<h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
|
<h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
|
||||||
|
{% if title | string | length > 0 %}
|
||||||
<p><b>title:</b> '<span class="copy">{{ title }}</span>'</p>
|
<p><b>title:</b> '<span class="copy">{{ title }}</span>'</p>
|
||||||
|
{% endif %}
|
||||||
<h2 class="center">content {{ media | length }} item(s)</h2>
|
<h2 class="center">content {{ media | length }} item(s)</h2>
|
||||||
|
<form class="center">
|
||||||
|
<label>
|
||||||
|
<input type="checkbox" id="safe-media-view" checked>
|
||||||
|
Safe Media View
|
||||||
|
</label>
|
||||||
|
</form>
|
||||||
<table class="content">
|
<table class="content">
|
||||||
<tr>
|
<tr>
|
||||||
<th>about</th>
|
<th>about</th>
|
||||||
<th>preview(s)</th>
|
<th>preview(s)</th>
|
||||||
</tr>
|
</tr>
|
||||||
{% for m in media %}
|
<tbody>
|
||||||
<tr>
|
{% for m in media %}
|
||||||
<td>
|
<tr>
|
||||||
<ul>
|
<td>
|
||||||
<li><b>key:</b> <span class="copy">{{ m.key }}</span></li>
|
<ul>
|
||||||
<li><b>type:</b> <span class="copy">{{ m.mimetype }}</span></li>
|
<li><b>key:</b> <span class="copy">{{ m.key }}</span></li>
|
||||||
|
<li><b>type:</b> <span class="copy">{{ m.mimetype }}</span></li>
|
||||||
|
|
||||||
{% for prop in m.properties %}
|
{% for prop in m.properties %}
|
||||||
|
|
||||||
{% if m.properties[prop] | is_list %}
|
{% if m.properties[prop] | is_list %}
|
||||||
<p></p>
|
|
||||||
<div>
|
|
||||||
<b class="collapsible" title="expand">{{ prop }}:</b>
|
|
||||||
<p></p>
|
<p></p>
|
||||||
<div class="collapsible-content">
|
<div>
|
||||||
{% for subprop in m.properties[prop] %}
|
<b class="collapsible" title="expand">{{ prop }}:</b>
|
||||||
{% if subprop | is_media %}
|
<p></p>
|
||||||
{{ macros.display_media(subprop, true, url) }}
|
<div class="collapsible-content">
|
||||||
|
{% for subprop in m.properties[prop] %}
|
||||||
|
{% if subprop | is_media %}
|
||||||
|
{{ macros.display_media(subprop, true, url) }}
|
||||||
|
|
||||||
<ul>
|
<ul>
|
||||||
{% for subprop_prop in subprop.properties %}
|
{% for subprop_prop in subprop.properties %}
|
||||||
<li><b>{{ subprop_prop }}:</b> {{ macros.copy_urlize(subprop.properties[subprop_prop]) }}</li>
|
<li><b>{{ subprop_prop }}:</b>
|
||||||
|
{{ macros.copy_urlize(subprop.properties[subprop_prop]) }}</li>
|
||||||
|
{% endfor %}
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
{% else %}
|
||||||
|
{{ subprop }}
|
||||||
|
{% endif %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</ul>
|
</div>
|
||||||
|
|
||||||
{% else %}
|
|
||||||
{{ subprop }}
|
|
||||||
{% endif %}
|
|
||||||
{% endfor %}
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
<p></p>
|
||||||
<p></p>
|
{% elif m.properties[prop] | string | length > 1 %}
|
||||||
{% elif m.properties[prop] | string | length > 1 %}
|
<li><b>{{ prop }}:</b> {{ macros.copy_urlize(m.properties[prop]) }}</li>
|
||||||
<li><b>{{ prop }}:</b> {{ macros.copy_urlize(m.properties[prop]) }}</li>
|
{% endif %}
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</ul>
|
</ul>
|
||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
{{ macros.display_media(m, true, url) }}
|
{{ macros.display_media(m, true, url) }}
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
<h2 class="center">metadata</h2>
|
<h2 class="center">metadata</h2>
|
||||||
<table class="metadata">
|
<table class="metadata">
|
||||||
@@ -220,6 +232,49 @@
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// logic for enabled/disabled greyscale
|
||||||
|
// Get references to the checkboxes and images/videos
|
||||||
|
const safeImageViewCheckbox = document.getElementById('safe-media-view');
|
||||||
|
const imagesVideos = document.querySelectorAll('img, video');
|
||||||
|
|
||||||
|
// Function to toggle grayscale effect
|
||||||
|
// Sync every image/video with the "Safe Media View" checkbox:
// grayscale when it is checked, no filter when it is not.
function toggleGrayscale() {
    const filterValue = safeImageViewCheckbox.checked ? 'grayscale(1)' : 'none';
    for (const element of imagesVideos) {
        element.style.filter = filterValue;
        element.style.webkitFilter = filterValue;
    }
}
|
||||||
|
|
||||||
|
// Add event listener to the checkbox to trigger the toggleGrayscale function
|
||||||
|
safeImageViewCheckbox.addEventListener('change', toggleGrayscale);
|
||||||
|
|
||||||
|
// Handle the hover effect using JavaScript
|
||||||
|
imagesVideos.forEach(element => {
|
||||||
|
element.addEventListener('mouseenter', () => {
|
||||||
|
// Disable grayscale effect on hover
|
||||||
|
element.style.filter = 'none';
|
||||||
|
element.style.webkitFilter = 'none';
|
||||||
|
});
|
||||||
|
|
||||||
|
element.addEventListener('mouseleave', () => {
|
||||||
|
// Re-enable grayscale effect if checkbox is checked
|
||||||
|
if (safeImageViewCheckbox.checked) {
|
||||||
|
element.style.filter = 'grayscale(1)';
|
||||||
|
element.style.webkitFilter = 'grayscale(1)';
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Call the function on page load to apply the initial state
|
||||||
|
toggleGrayscale();
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
</html>
|
</html>
|
||||||
@@ -17,9 +17,6 @@ No URL available for {{ m.key }}.
|
|||||||
<a href="https://yandex.ru/images/touch/search?rpt=imageview&url={{ url | quote }}">Yandex</a>,
|
<a href="https://yandex.ru/images/touch/search?rpt=imageview&url={{ url | quote }}">Yandex</a>,
|
||||||
<a href="https://www.bing.com/images/search?view=detailv2&iss=sbi&form=SBIVSP&sbisrc=UrlPaste&q=imgurl:{{ url | quote }}">Bing</a>,
|
<a href="https://www.bing.com/images/search?view=detailv2&iss=sbi&form=SBIVSP&sbisrc=UrlPaste&q=imgurl:{{ url | quote }}">Bing</a>,
|
||||||
<a href="https://www.tineye.com/search/?url={{ url | quote }}">Tineye</a>,
|
<a href="https://www.tineye.com/search/?url={{ url | quote }}">Tineye</a>,
|
||||||
<a href="https://iqdb.org/?url={{ url | quote }}">IQDB</a>,
|
|
||||||
<a href="https://saucenao.com/search.php?db=999&url={{ url | quote }}">SauceNAO</a>,
|
|
||||||
<a href="https://imgops.com/{{ url | quote }}">IMGOPS</a>
|
|
||||||
</div>
|
</div>
|
||||||
<p></p>
|
<p></p>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ def expand_url(url):
|
|||||||
logger.error(f'Failed to expand url {url}')
|
logger.error(f'Failed to expand url {url}')
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
def getattr_or(o: object, prop: str, default=None):
|
def getattr_or(o: object, prop: str, default=None):
|
||||||
try:
|
try:
|
||||||
res = getattr(o, prop)
|
res = getattr(o, prop)
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
from urllib.parse import urlparse, urlunparse
|
from urllib.parse import urlparse, urlunparse
|
||||||
|
|
||||||
|
|
||||||
class UrlUtil:
|
class UrlUtil:
|
||||||
telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
|
telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
|
||||||
is_istagram = re.compile(r"https:\/\/www\.instagram\.com")
|
is_istagram = re.compile(r"https:\/\/www\.instagram\.com")
|
||||||
@@ -43,4 +42,22 @@ class UrlUtil:
|
|||||||
|
|
||||||
# twitter profile pictures
|
# twitter profile pictures
|
||||||
if "twimg.com/profile_images" in url: return False
|
if "twimg.com/profile_images" in url: return False
|
||||||
|
if "twimg.com" in url and "/default_profile_images" in url: return False
|
||||||
|
|
||||||
|
# instagram profile pictures
|
||||||
|
if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
|
||||||
|
# instagram recurring images
|
||||||
|
if "https://static.cdninstagram.com/rsrc.php/" in url: return False
|
||||||
|
|
||||||
|
# telegram
|
||||||
|
if "https://telegram.org/img/emoji/" in url: return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def twitter_best_quality_url(url: str) -> str:
    """
    some twitter image URLs point to a less-than best quality
    this returns the URL pointing to the highest (original) quality

    Only a query parameter named exactly ``name`` is rewritten (first
    occurrence), so parameters that merely end in "name" such as
    ``username=`` or ``filename=`` are left untouched. URLs without such a
    parameter are returned unchanged.
    """
    # anchor on the '?' or '&' that introduces the parameter; pass count by
    # keyword since passing it positionally is deprecated in recent Python
    return re.sub(r"([?&])name=\w+", r"\1name=orig", url, count=1)
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ _MAJOR = "0"
|
|||||||
_MINOR = "6"
|
_MINOR = "6"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "0"
|
_PATCH = "2"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
Reference in New Issue
Block a user