mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
improving ignored content from waczs
This commit is contained in:
@@ -1,10 +1,15 @@
|
||||
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import traceback
|
||||
from typing import Any, List
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json, config
|
||||
import mimetypes
|
||||
|
||||
import ffmpeg
|
||||
from ffmpeg._run import Error
|
||||
|
||||
from .context import ArchivingContext
|
||||
|
||||
from loguru import logger
|
||||
@@ -74,6 +79,23 @@ class Media:
|
||||
|
||||
def is_audio(self) -> bool:
|
||||
return self.mimetype.startswith("audio")
|
||||
|
||||
|
||||
def is_image(self) -> bool:
|
||||
return self.mimetype.startswith("image")
|
||||
|
||||
def is_valid_video(self) -> bool:
|
||||
# checks for video streams with ffmpeg, or min file size for a video
|
||||
# self.is_video() should be used together with this method
|
||||
try:
|
||||
streams = ffmpeg.probe(self.filename, select_streams='v')['streams']
|
||||
logger.warning(f"STREAMS FOR {self.filename} {streams}")
|
||||
return any(s.get("duration_ts") > 0 for s in streams)
|
||||
except Error: return False # ffmpeg errors when reading bad files
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
logger.error(traceback.format_exc())
|
||||
try:
|
||||
fsize = os.path.getsize(self.filename)
|
||||
return fsize > 20_000
|
||||
except: pass
|
||||
return True
|
||||
|
||||
@@ -180,6 +180,9 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||
m.set("src_alternative", record_url)
|
||||
except Exception as e: logger.warning(f"Unable to download best quality URL for {record_url=} got error {e}, using original in WARC.")
|
||||
|
||||
# remove bad videos
|
||||
if m.is_video() and not m.is_valid_video(): continue
|
||||
|
||||
to_enrich.add_media(m, warc_fn)
|
||||
counter += 1
|
||||
seen_urls.add(record_url)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import re
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
|
||||
class UrlUtil:
|
||||
telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
|
||||
is_istagram = re.compile(r"https:\/\/www\.instagram\.com")
|
||||
@@ -44,6 +43,12 @@ class UrlUtil:
|
||||
# twitter profile pictures
|
||||
if "twimg.com/profile_images" in url: return False
|
||||
if "twimg.com" in url and "/default_profile_images" in url: return False
|
||||
|
||||
# instagram profile pictures
|
||||
if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
|
||||
# instagram recurring images
|
||||
if "https://static.cdninstagram.com/rsrc.php/" in url: return False
|
||||
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
|
||||
Reference in New Issue
Block a user