Merge pull request #407 from bellingcat/dev

minor bug fix: handles failed GET downloads (download_from_url returning None)
2026-06-07 19:08:30 +03:00 · 2026-03-02 17:10:46 +00:00
parent 5d6c5ac2b1 23a88e3cf4
commit 63cfe34e23
15 changed files with 335 additions and 22 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [project]
 name = "auto-archiver"
-version = "1.2.3"
+version = "1.2.4"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
 requires-python = ">=3.10,<3.13"
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -181,6 +181,9 @@ class Metadata:
        media_hashes = set()
        new_media = []
        for m in self.media:
            if not m.filename:
                new_media.append(m)
                continue
            h = m.get("hash")
            if not h:
                h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
--- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
@@ -73,6 +73,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
        if self.enrich(result):
            result.status = "antibot"
            return result
        return False
    def _prepare_user_data_dir(self):
        if self.user_data_dir:
--- a/src/auto_archiver/modules/generic_extractor/bluesky.py
+++ b/src/auto_archiver/modules/generic_extractor/bluesky.py
@@ -39,12 +39,18 @@ class Bluesky(GenericDropin):
        media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
        for image_media in image_medias:
            url = media_url.format(image_media["image"]["ref"]["$link"], post["author"]["did"])
-            image_media = archiver.download_from_url(url)
+            filename = archiver.download_from_url(url)
-            media.append(Media(image_media))
+            if filename:
                media.append(Media(filename))
            else:
                logger.warning(f"Failed to download Bluesky image from {url}")
        for video_media in video_medias:
            url = media_url.format(video_media["ref"]["$link"], post["author"]["did"])
-            video_media = archiver.download_from_url(url)
+            filename = archiver.download_from_url(url)
-            media.append(Media(video_media))
+            if filename:
                media.append(Media(filename))
            else:
                logger.warning(f"Failed to download Bluesky video from {url}")
        return media
    def _get_post_data(self, post: dict) -> dict:
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -204,8 +204,11 @@ class GenericExtractor(Extractor):
        if thumbnail_url:
            try:
                cover_image_path = self.download_from_url(thumbnail_url)
-                media = Media(cover_image_path)
+                if cover_image_path:
-                metadata.add_media(media, id="cover")
+                    media = Media(cover_image_path)
                    metadata.add_media(media, id="cover")
                else:
                    logger.warning(f"Failed to download cover image from {thumbnail_url}")
            except Exception as e:
                logger.error(f"Could not download cover image {thumbnail_url}: {e}")
--- a/src/auto_archiver/modules/generic_extractor/truth.py
+++ b/src/auto_archiver/modules/generic_extractor/truth.py
@@ -1,6 +1,7 @@
 from typing import Type
 from auto_archiver.utils import traverse_obj
 from auto_archiver.utils.custom_logger import logger
 from auto_archiver.core.metadata import Metadata, Media
 from auto_archiver.core.extractor import Extractor
 from yt_dlp.extractor.common import InfoExtractor
@@ -58,6 +59,9 @@ class Truth(GenericDropin):
        # add the media
        for media in post.get("media_attachments", []):
            filename = archiver.download_from_url(media["url"])
            if not filename:
                logger.warning(f"Failed to download media from {media['url']}")
                continue
            result.add_media(Media(filename), id=media.get("id"))
        return result
--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@@ -157,5 +157,8 @@ class Twitter(GenericDropin):
                mimetype = variant["content_type"]
            ext = mimetypes.guess_extension(mimetype)
            media.filename = archiver.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
            if not media.filename:
                logger.warning(f"Failed to download media from {media.get('src')}")
                continue
            result.add_media(media)
        return result
--- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py
+++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py
@@ -25,6 +25,9 @@ class HashEnricher(Enricher):
        logger.debug(f"Calculating media hashes with algo={self.algorithm}")
        for i, m in enumerate(to_enrich.media):
            if not m.filename:
                logger.warning(f"Skipping hash for media without filename: {m}")
                continue
            if len(hd := self.calculate_hash(m.filename)):
                to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
--- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
+++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
@@ -99,7 +99,10 @@ class InstagramAPIExtractor(Extractor):
        result.set_title(user.get("full_name", username)).set("data", user)
        if pic_url := user.get("profile_pic_url_hd", user.get("profile_pic_url")):
            filename = self.download_from_url(pic_url)
-            result.add_media(Media(filename=filename), id="profile_picture")
+            if filename:
                result.add_media(Media(filename=filename), id="profile_picture")
            else:
                logger.warning(f"Failed to download profile picture from {pic_url}")
        count_posts = 0
        if self.full_profile:
@@ -202,7 +205,10 @@ class InstagramAPIExtractor(Extractor):
        if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"):
            filename = self.download_from_url(cover_media)
-            result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
+            if filename:
                result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
            else:
                logger.warning(f"Failed to download cover media from {cover_media}")
        items = h_info.get("items", [])[::-1]  # newest to oldest
        items = items[: min(max_to_download, len(items))]
@@ -345,7 +351,10 @@ class InstagramAPIExtractor(Extractor):
        image_media = None
        if image_url := item.get("thumbnail_url"):
            filename = self.download_from_url(image_url, verbose=False)
-            image_media = Media(filename=filename)
+            if filename:
                image_media = Media(filename=filename)
            else:
                logger.warning(f"Failed to download thumbnail from {image_url}")
        # retrieve video info
        best_id = item.get("id", item.get("pk"))
@@ -357,16 +366,19 @@ class InstagramAPIExtractor(Extractor):
        if video_url := item.get("video_url"):
            filename = self.download_from_url(video_url, verbose=False)
-            video_media = Media(filename=filename)
+            if filename:
-            if taken_at:
+                video_media = Media(filename=filename)
-                video_media.set("date", taken_at)
+                if taken_at:
-            if code:
+                    video_media.set("date", taken_at)
-                video_media.set("url", f"https://www.instagram.com/p/{code}")
+                if code:
-            if caption_text:
+                    video_media.set("url", f"https://www.instagram.com/p/{code}")
-                video_media.set("text", caption_text)
+                if caption_text:
-            video_media.set("preview", [image_media])
+                    video_media.set("text", caption_text)
-            video_media.set("data", [item])
+                video_media.set("preview", [image_media])
-            return item, video_media, f"{context or 'video'} {best_id}"
+                video_media.set("data", [item])
                return item, video_media, f"{context or 'video'} {best_id}"
            else:
                logger.warning(f"Failed to download video from {video_url}")
        elif image_media:
            if taken_at:
                image_media.set("date", taken_at)
--- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py
+++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py
@@ -25,6 +25,9 @@ class MetaEnricher(Enricher):
        logger.debug(f"Calculating archive file sizes for {len(to_enrich.media)} media files")
        total_size = 0
        for media in to_enrich.get_all_media():
            if not media.filename:
                logger.warning(f"Skipping file size for media without filename: {media}")
                continue
            file_stats = os.stat(media.filename)
            media.set("bytes", file_stats.st_size)
            media.set("size", self.human_readable_bytes(file_stats.st_size))
--- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py
+++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py
@@ -49,10 +49,18 @@ class TelegramExtractor(Extractor):
            if not len(image_urls):
                return False
            for img_url in image_urls:
-                result.add_media(Media(self.download_from_url(img_url)))
+                filename = self.download_from_url(img_url)
                if not filename:
                    logger.warning(f"Failed to download image from {img_url}")
                    continue
                result.add_media(Media(filename))
        else:
            video_url = video.get("src")
-            m_video = Media(self.download_from_url(video_url))
+            video_filename = self.download_from_url(video_url)
            if not video_filename:
                logger.warning(f"Failed to download video from {video_url}")
                return False
            m_video = Media(video_filename)
            # extract duration from HTML
            try:
                duration = s.find_all("time")[0].contents[0]
--- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
+++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
@@ -190,6 +190,9 @@ class TelethonExtractor(Extractor):
                            )
                        for i, om_url in enumerate(other_media_urls):
                            filename = self.download_from_url(om_url, f"{chat}_{group_id}_{i}")
                            if not filename:
                                logger.warning(f"Failed to download media from {om_url}")
                                continue
                            result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
                    filename_dest = os.path.join(self.tmp_dir, f"{chat}_{group_id}", str(mp.id))
--- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
@@ -114,6 +114,9 @@ class TwitterApiExtractor(Extractor):
                logger.info(f"Found media {media}")
                ext = mimetypes.guess_extension(mimetype)
                media.filename = self.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
                if not media.filename:
                    logger.warning(f"Failed to download media from {media.get('src')}")
                    continue
                result.add_media(media)
        result.set_content(
--- a/tests/extractors/test_antibot_extractor_enricher.py
+++ b/tests/extractors/test_antibot_extractor_enricher.py
@@ -53,6 +53,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
    }
    @pytest.mark.download
    @pytest.mark.flaky(reruns=2, reruns_delay=5)
    @pytest.mark.parametrize(
        "url,in_title,in_text,image_count,video_count,skip_ci",
        [
@@ -128,6 +129,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
        item = make_item(url)
        result = self.extractor.download(item)
        assert result, f"download() returned {result!r} — Selenium may have failed (e.g., window close timeout)"
        assert result.status == "antibot", "Expected status to be 'antibot'"
        # Check title contains all required words (case-insensitive)
--- a/tests/test_none_filename_handling.py
+++ b/tests/test_none_filename_handling.py
@@ -0,0 +1,259 @@
 """
 Tests for handling Media objects with None filename.
 When download_from_url fails, it returns None. Various enrichers and
 the metadata deduplication logic must gracefully handle Media objects
 where filename is None, rather than crashing with TypeError.
 """
 from datetime import datetime, timezone
 from unittest.mock import MagicMock
 import pytest
 from auto_archiver.core.metadata import Metadata, Media
 from auto_archiver.modules.hash_enricher import HashEnricher
 from auto_archiver.modules.meta_enricher import MetaEnricher
 # ── HashEnricher ──────────────────────────────────────────────────────
 class TestHashEnricherNoneFilename:
    """HashEnricher must tolerate media whose filename is None.

    Such entries are expected to end up without a "hash" value instead of
    making enrich() raise.
    """
    @pytest.fixture(autouse=True)
    def setup(self, setup_module):
        """Create the HashEnricher under test through the setup_module fixture."""
        config = {"algorithm": "SHA-256", "chunksize": 100}
        self.enricher = setup_module(HashEnricher, config)
    def test_skips_none_filename(self):
        """A single file-less media entry is enriched without error and gets no hash."""
        unfetched = Media(filename=None)
        unfetched.set("src", "https://example.com/video.mp4")
        item = Metadata().set_url("https://example.com")
        item.add_media(unfetched)
        # enrich() raising here is exactly the regression being guarded against
        self.enricher.enrich(item)
        stored = item.media[0]
        assert stored.get("hash") is None
    def test_hashes_valid_skips_none(self, tmp_path):
        """With one real file and one file-less entry, only the real file is hashed."""
        on_disk = tmp_path / "test.txt"
        on_disk.write_text("hello world")
        item = Metadata().set_url("https://example.com")
        # first entry points at a real file, second one has no file at all
        for name in (str(on_disk), None):
            item.add_media(Media(filename=name))
        self.enricher.enrich(item)
        with_file, without_file = item.media[0], item.media[1]
        assert with_file.get("hash") is not None
        assert without_file.get("hash") is None
    def test_all_none_filenames(self):
        """Only file-less entries: nothing is dropped and nothing is hashed."""
        item = Metadata().set_url("https://example.com")
        for _ in range(2):
            item.add_media(Media(filename=None))
        self.enricher.enrich(item)
        assert len(item.media) == 2
        assert all(entry.get("hash") is None for entry in item.media)
 # ── MetaEnricher ──────────────────────────────────────────────────────
 class TestMetaEnricherNoneFilename:
    """MetaEnricher must tolerate media whose filename is None.

    File-less entries are expected to receive no "bytes" value and to add
    nothing to the item's "total_bytes".
    """
    @pytest.fixture(autouse=True)
    def setup(self, setup_module):
        """Create the MetaEnricher under test through the setup_module fixture."""
        self.enricher = setup_module(MetaEnricher, {})
    @staticmethod
    def _fresh_item():
        """Return a Metadata with the URL and "_processed_at" both tests set up."""
        item = Metadata().set_url("https://example.com")
        item.set("_processed_at", datetime.now(tz=timezone.utc))
        return item
    def test_skips_none_filename(self):
        """A lone file-less entry is enriched without error; the total stays at 0."""
        item = self._fresh_item()
        unfetched = Media(filename=None)
        unfetched.set("src", "https://example.com/video.mp4")
        item.add_media(unfetched)
        # enrich() raising here is exactly the regression being guarded against
        self.enricher.enrich(item)
        assert item.get("total_bytes") == 0
    def test_sizes_valid_skips_none(self, tmp_path):
        """With one real file and one file-less entry, only the real file is sized."""
        payload = "A" * 500
        on_disk = tmp_path / "test.txt"
        on_disk.write_text(payload)
        item = self._fresh_item()
        # first entry points at a real file, second one has no file at all
        for name in (str(on_disk), None):
            item.add_media(Media(filename=name))
        self.enricher.enrich(item)
        sized, unsized = item.media[0], item.media[1]
        assert sized.get("bytes") == 500
        assert unsized.get("bytes") is None
        assert item.get("total_bytes") == 500
 # ── Metadata.remove_duplicate_media_by_hash ───────────────────────────
 class TestRemoveDuplicateMediaNoneFilename:
    """Metadata.remove_duplicate_media_by_hash must keep media whose filename is None."""
    @staticmethod
    def _sourced_placeholder():
        """Return a file-less Media that only carries a "src" URL."""
        placeholder = Media(filename=None)
        placeholder.set("src", "https://example.com/video.mp4")
        return placeholder
    def test_none_filename_kept(self):
        """A lone file-less entry survives deduplication untouched."""
        item = Metadata().set_url("https://example.com")
        item.add_media(self._sourced_placeholder())
        item.remove_duplicate_media_by_hash()
        assert len(item.media) == 1
        survivor = item.media[0]
        assert survivor.filename is None
    def test_none_and_valid_mixed(self, tmp_path):
        """A file-backed entry and a file-less entry are both kept."""
        on_disk = tmp_path / "test.txt"
        on_disk.write_text("content")
        item = Metadata().set_url("https://example.com")
        item.add_media(Media(filename=str(on_disk)))
        item.add_media(self._sourced_placeholder())
        item.remove_duplicate_media_by_hash()
        assert len(item.media) == 2
    def test_multiple_none_filename_all_kept(self):
        """Several file-less entries are all kept (no file means nothing to compare)."""
        item = Metadata().set_url("https://example.com")
        for _ in range(2):
            item.add_media(Media(filename=None))
        item.remove_duplicate_media_by_hash()
        assert len(item.media) == 2
 # ── Twitter dropin create_metadata ────────────────────────────────────
 class TestTwitterDropinNoneFilename:
    """Twitter dropin create_metadata when the archiver's download_from_url fails.

    A download that yields None must not contribute a media entry to the result.
    """
    # URL handed to create_metadata in every test of this class
    _STATUS_URL = "https://x.com/test/status/123"
    @pytest.fixture
    def twitter_dropin(self):
        """Provide a fresh Twitter dropin; the import happens inside the fixture."""
        from auto_archiver.modules.generic_extractor.twitter import Twitter
        return Twitter()
    @staticmethod
    def _photo(media_url):
        """Return a photo media entity pointing at media_url."""
        return {"type": "photo", "media_url_https": media_url}
    @staticmethod
    def _tweet(full_text, media_entities):
        """Return a minimal tweet payload carrying the given media entities."""
        return {
            "user": {"name": "Test User"},
            "created_at": "Sun Feb 08 18:45:00 +0000 2026",
            "full_text": full_text,
            "entities": {"media": media_entities},
        }
    def _create(self, dropin, tweet, archiver):
        """Call create_metadata with the arguments shared by all tests here."""
        return dropin.create_metadata(tweet, None, archiver, self._STATUS_URL)
    def test_create_metadata_skips_failed_photo_download(self, twitter_dropin):
        """A photo whose download returns None is left out of the result's media."""
        tweet = self._tweet(
            "Test tweet with photo",
            [self._photo("https://pbs.twimg.com/media/test.jpg")],
        )
        # every download attempt reports failure by returning None
        archiver = MagicMock(**{"download_from_url.return_value": None})
        result = self._create(twitter_dropin, tweet, archiver)
        assert len(result.media) == 0
    def test_create_metadata_skips_failed_video_download(self, twitter_dropin):
        """A video whose download returns None is left out of the result's media."""
        variant = {
            "url": "https://video.twimg.com/vid/1280x720/test.mp4",
            "content_type": "video/mp4",
        }
        video = {"type": "video", "video_info": {"variants": [variant]}}
        tweet = self._tweet("Test tweet with video", [video])
        archiver = MagicMock(**{"download_from_url.return_value": None})
        result = self._create(twitter_dropin, tweet, archiver)
        assert len(result.media) == 0
    def test_create_metadata_keeps_successful_download(self, twitter_dropin, tmp_path):
        """A photo whose download returns a path shows up as the single media entry."""
        tweet = self._tweet(
            "Test tweet with photo",
            [self._photo("https://pbs.twimg.com/media/test.jpg")],
        )
        saved = tmp_path / "test.jpg"
        saved.write_text("fake image data")
        archiver = MagicMock(**{"download_from_url.return_value": str(saved)})
        result = self._create(twitter_dropin, tweet, archiver)
        assert len(result.media) == 1
        assert result.media[0].filename == str(saved)
    def test_create_metadata_mixed_downloads(self, twitter_dropin, tmp_path):
        """Of two photos, only the one whose download succeeded is kept."""
        tweet = self._tweet(
            "Test tweet with two photos",
            [
                self._photo("https://pbs.twimg.com/media/test1.jpg"),
                self._photo("https://pbs.twimg.com/media/test2.jpg"),
            ],
        )
        saved = tmp_path / "test1.jpg"
        saved.write_text("fake image data")
        # first call yields a path, second call yields None (failed download)
        archiver = MagicMock(**{"download_from_url.side_effect": [str(saved), None]})
        result = self._create(twitter_dropin, tweet, archiver)
        assert len(result.media) == 1
        assert result.media[0].filename == str(saved)