def is_valid_video(self) -> bool:
    """Heuristically decide whether ``self.filename`` holds a usable video.

    Intended to be combined with ``self.is_video()``; this method does not
    look at the mimetype itself.

    Decision order:
      1. Probe the file's video streams with ffmpeg. If any stream reports
         a positive ``duration_ts`` the file is considered valid.
      2. If ffmpeg itself rejects the file (``ffmpeg._run.Error``) the file
         is considered invalid.
      3. If every probed stream reports a numeric ``duration_ts`` and none
         is positive (this includes "no video streams at all"), the file
         is considered invalid.
      4. Otherwise (a stream has no usable ``duration_ts``, or probing
         failed for an unexpected reason) fall back to a minimum file size
         check.
      5. If even the size cannot be read, give the file the benefit of the
         doubt and report it as valid.

    Returns:
        True when the file looks like a real video, False otherwise.
    """
    # smallest size, in bytes, accepted by the fallback heuristic
    min_video_bytes = 20_000

    try:
        streams = ffmpeg.probe(self.filename, select_streams='v')['streams']
        # diagnostic dump only, so it is kept out of the warning level
        logger.debug(f"STREAMS FOR {self.filename} {streams}")

        durations = [s.get("duration_ts") for s in streams]
        if any(isinstance(d, (int, float)) and d > 0 for d in durations):
            return True
        if all(isinstance(d, (int, float)) for d in durations):
            # every stream reported a duration and none was positive;
            # an empty stream list also ends up here
            return False
        # A stream entry without a numeric duration_ts is inconclusive:
        # comparing None > 0 would raise TypeError, so use the size
        # heuristic below instead of relying on an exception.
    except Error:
        # ffmpeg errors when reading bad files
        return False
    except Exception as e:
        # unexpected probing failure: record it, then try the size heuristic
        logger.error(e)
        logger.error(traceback.format_exc())

    try:
        return os.path.getsize(self.filename) > min_video_bytes
    except Exception as e:
        # best effort: an unreadable size must not discard the media, but
        # unlike a bare except this lets KeyboardInterrupt/SystemExit through
        logger.warning(f"Unable to read file size of {self.filename}: {e}")
    return True
b/src/auto_archiver/utils/url.py index 9f27ef6..644e5d0 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -1,7 +1,6 @@ import re from urllib.parse import urlparse, urlunparse - class UrlUtil: telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)") is_istagram = re.compile(r"https:\/\/www\.instagram\.com") @@ -44,6 +43,12 @@ class UrlUtil: # twitter profile pictures if "twimg.com/profile_images" in url: return False if "twimg.com" in url and "/default_profile_images" in url: return False + + # instagram profile pictures + if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False + # instagram recurring images + if "https://static.cdninstagram.com/rsrc.php/" in url: return False + return True @staticmethod