improving ignored content from waczs

2026-06-08 03:18:28 +03:00 · 2023-07-28 12:19:14 +01:00
parent 7a5c9c65bd
commit aa71c85a98
3 changed files with 32 additions and 2 deletions
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -1,10 +1,15 @@

 from __future__ import annotations
+import os
+import traceback
 from typing import Any, List
 from dataclasses import dataclass, field
 from dataclasses_json import dataclass_json, config
 import mimetypes

+import ffmpeg
+from ffmpeg._run import Error
+
 from .context import ArchivingContext

 from loguru import logger
@@ -74,6 +79,23 @@ class Media:

    def is_audio(self) -> bool:
        return self.mimetype.startswith("audio")
-    
+
    def is_image(self) -> bool:
        return self.mimetype.startswith("image")
+
+    def is_valid_video(self) -> bool:
+        """Return False when ffmpeg finds no video stream with a positive duration in self.filename; use together with self.is_video()."""
+        try:
+            streams = ffmpeg.probe(self.filename, select_streams='v')['streams']
+            logger.debug(f"STREAMS FOR {self.filename} {streams}")
+            # duration_ts may be absent or None on a stream; count it as 0 instead of raising TypeError on `None > 0`
+            return any((s.get("duration_ts") or 0) > 0 for s in streams)
+        except Error: return False # ffmpeg errors when reading bad files
+        except Exception as e:
+            logger.error(e)
+            logger.error(traceback.format_exc())
+            try:
+                # probing failed for an unexpected reason: fall back to a minimum file size (in bytes) for a video
+                return os.path.getsize(self.filename) > 20_000
+            except OSError: pass # size unavailable (missing/unreadable file): fall through and report valid
+        return True
--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@@ -180,6 +180,9 @@ class WaczArchiverEnricher(Enricher, Archiver):
                        m.set("src_alternative", record_url)
                    except Exception as e: logger.warning(f"Unable to download best quality URL for {record_url=} got error {e}, using original in WARC.")

+                # remove bad videos
+                if m.is_video() and not m.is_valid_video(): continue
+                
                to_enrich.add_media(m, warc_fn)
                counter += 1
                seen_urls.add(record_url)
--- a/src/auto_archiver/utils/url.py
+++ b/src/auto_archiver/utils/url.py
@@ -1,7 +1,6 @@
 import re
 from urllib.parse import urlparse, urlunparse

-
 class UrlUtil:
    telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
    is_istagram = re.compile(r"https:\/\/www\.instagram\.com")
@@ -44,6 +43,12 @@ class UrlUtil:
        # twitter profile pictures
        if "twimg.com/profile_images" in url: return False
        if "twimg.com" in url and "/default_profile_images" in url: return False
+
+        # instagram profile pictures
+        if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
+        # instagram recurring images
+        if "https://static.cdninstagram.com/rsrc.php/" in url: return False
+
        return True

    @staticmethod