diff --git a/pyproject.toml b/pyproject.toml index 3ff38ad..08e1bf6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [project] name = "auto-archiver" -version = "1.2.3" +version = "1.2.4" description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." requires-python = ">=3.10,<3.13" diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index f1ac3c0..05f7d9c 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -181,6 +181,9 @@ class Metadata: media_hashes = set() new_media = [] for m in self.media: + if not m.filename: + new_media.append(m) + continue h = m.get("hash") if not h: h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename) diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index 26a1103..15c70e3 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -73,6 +73,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): if self.enrich(result): result.status = "antibot" return result + return False def _prepare_user_data_dir(self): if self.user_data_dir: diff --git a/src/auto_archiver/modules/generic_extractor/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py index a4357ca..cece8fd 100644 --- a/src/auto_archiver/modules/generic_extractor/bluesky.py +++ b/src/auto_archiver/modules/generic_extractor/bluesky.py @@ -39,12 +39,18 @@ class Bluesky(GenericDropin): media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}" for image_media in image_medias: url = media_url.format(image_media["image"]["ref"]["$link"], post["author"]["did"]) - image_media = archiver.download_from_url(url) - media.append(Media(image_media)) + filename = archiver.download_from_url(url) + if filename: + media.append(Media(filename)) + else: + logger.warning(f"Failed to download Bluesky image from {url}") for video_media in video_medias: url = media_url.format(video_media["ref"]["$link"], post["author"]["did"]) - video_media = archiver.download_from_url(url) - media.append(Media(video_media)) + filename = archiver.download_from_url(url) + if filename: + media.append(Media(filename)) + else: + logger.warning(f"Failed to download Bluesky video from {url}") return media def _get_post_data(self, post: dict) -> dict: diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 73076c2..d3a2f1b 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -204,8 +204,11 @@ class GenericExtractor(Extractor): if thumbnail_url: try: cover_image_path = self.download_from_url(thumbnail_url) - media = Media(cover_image_path) - metadata.add_media(media, id="cover") + if cover_image_path: + media = Media(cover_image_path) + metadata.add_media(media, id="cover") + else: + logger.warning(f"Failed to download cover image from {thumbnail_url}") except Exception as e: logger.error(f"Could not download cover image {thumbnail_url}: {e}") diff --git a/src/auto_archiver/modules/generic_extractor/truth.py b/src/auto_archiver/modules/generic_extractor/truth.py index 345f1cd..830acb4 100644 --- a/src/auto_archiver/modules/generic_extractor/truth.py +++ b/src/auto_archiver/modules/generic_extractor/truth.py @@ -1,6 +1,7 @@ from typing import Type from auto_archiver.utils import traverse_obj +from auto_archiver.utils.custom_logger import logger from auto_archiver.core.metadata import Metadata, Media from auto_archiver.core.extractor import Extractor from yt_dlp.extractor.common import InfoExtractor @@ -58,6 +59,9 @@ class Truth(GenericDropin): # add the media for media in post.get("media_attachments", []): filename = archiver.download_from_url(media["url"]) + if not filename: + logger.warning(f"Failed to download media from {media['url']}") + continue result.add_media(Media(filename), id=media.get("id")) return result diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index a93f363..58aff48 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -157,5 +157,8 @@ class Twitter(GenericDropin): mimetype = variant["content_type"] ext = mimetypes.guess_extension(mimetype) media.filename = archiver.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}") + if not media.filename: + logger.warning(f"Failed to download media from {media.get('src')}") + continue result.add_media(media) return result diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 526bf9a..c5f0c66 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -25,6 +25,9 @@ class HashEnricher(Enricher): logger.debug(f"Calculating media hashes with algo={self.algorithm}") for i, m in enumerate(to_enrich.media): + if not m.filename: + logger.warning(f"Skipping hash for media without filename: {m}") + continue if len(hd := self.calculate_hash(m.filename)): to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}") diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index 89337db..bdf32f6 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -99,7 +99,10 @@ class InstagramAPIExtractor(Extractor): result.set_title(user.get("full_name", username)).set("data", user) if pic_url := user.get("profile_pic_url_hd", user.get("profile_pic_url")): filename = self.download_from_url(pic_url) - result.add_media(Media(filename=filename), id="profile_picture") + if filename: + result.add_media(Media(filename=filename), id="profile_picture") + else: + logger.warning(f"Failed to download profile picture from {pic_url}") count_posts = 0 if self.full_profile: @@ -202,7 +205,10 @@ class InstagramAPIExtractor(Extractor): if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"): filename = self.download_from_url(cover_media) - result.add_media(Media(filename=filename), id=f"cover_media highlight {id}") + if filename: + result.add_media(Media(filename=filename), id=f"cover_media highlight {id}") + else: + logger.warning(f"Failed to download cover media from {cover_media}") items = h_info.get("items", [])[::-1] # newest to oldest items = items[: min(max_to_download, len(items))] @@ -345,7 +351,10 @@ class InstagramAPIExtractor(Extractor): image_media = None if image_url := item.get("thumbnail_url"): filename = self.download_from_url(image_url, verbose=False) - image_media = Media(filename=filename) + if filename: + image_media = Media(filename=filename) + else: + logger.warning(f"Failed to download thumbnail from {image_url}") # retrieve video info best_id = item.get("id", item.get("pk")) @@ -357,16 +366,19 @@ class InstagramAPIExtractor(Extractor): if video_url := item.get("video_url"): filename = self.download_from_url(video_url, verbose=False) - video_media = Media(filename=filename) - if taken_at: - video_media.set("date", taken_at) - if code: - video_media.set("url", f"https://www.instagram.com/p/{code}") - if caption_text: - video_media.set("text", caption_text) - video_media.set("preview", [image_media]) - video_media.set("data", [item]) - return item, video_media, f"{context or 'video'} {best_id}" + if filename: + video_media = Media(filename=filename) + if taken_at: + video_media.set("date", taken_at) + if code: + video_media.set("url", f"https://www.instagram.com/p/{code}") + if caption_text: + video_media.set("text", caption_text) + video_media.set("preview", [image_media]) + video_media.set("data", [item]) + return item, video_media, f"{context or 'video'} {best_id}" + else: + logger.warning(f"Failed to download video from {video_url}") elif image_media: if taken_at: image_media.set("date", taken_at) diff --git a/src/auto_archiver/modules/meta_enricher/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py index fd50291..8c0c460 100644 --- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py +++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py @@ -25,6 +25,9 @@ class MetaEnricher(Enricher): logger.debug(f"Calculating archive file sizes for {len(to_enrich.media)} media files") total_size = 0 for media in to_enrich.get_all_media(): + if not media.filename: + logger.warning(f"Skipping file size for media without filename: {media}") + continue file_stats = os.stat(media.filename) media.set("bytes", file_stats.st_size) media.set("size", self.human_readable_bytes(file_stats.st_size)) diff --git a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py index 03e4be5..9c83610 100644 --- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py +++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py @@ -49,10 +49,18 @@ class TelegramExtractor(Extractor): if not len(image_urls): return False for img_url in image_urls: - result.add_media(Media(self.download_from_url(img_url))) + filename = self.download_from_url(img_url) + if not filename: + logger.warning(f"Failed to download image from {img_url}") + continue + result.add_media(Media(filename)) else: video_url = video.get("src") - m_video = Media(self.download_from_url(video_url)) + video_filename = self.download_from_url(video_url) + if not video_filename: + logger.warning(f"Failed to download video from {video_url}") + return False + m_video = Media(video_filename) # extract duration from HTML try: duration = s.find_all("time")[0].contents[0] diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index aa3afb7..84d1a5b 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -190,6 +190,9 @@ class TelethonExtractor(Extractor): ) for i, om_url in enumerate(other_media_urls): filename = self.download_from_url(om_url, f"{chat}_{group_id}_{i}") + if not filename: + logger.warning(f"Failed to download media from {om_url}") + continue result.add_media(Media(filename=filename), id=f"{group_id}_{i}") filename_dest = os.path.join(self.tmp_dir, f"{chat}_{group_id}", str(mp.id)) diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index 420008c..50a1c84 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -114,6 +114,9 @@ class TwitterApiExtractor(Extractor): logger.info(f"Found media {media}") ext = mimetypes.guess_extension(mimetype) media.filename = self.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}") + if not media.filename: + logger.warning(f"Failed to download media from {media.get('src')}") + continue result.add_media(media) result.set_content( diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py index 9becfe9..7663ba0 100644 --- a/tests/extractors/test_antibot_extractor_enricher.py +++ b/tests/extractors/test_antibot_extractor_enricher.py @@ -53,6 +53,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase): } @pytest.mark.download + @pytest.mark.flaky(reruns=2, reruns_delay=5) @pytest.mark.parametrize( "url,in_title,in_text,image_count,video_count,skip_ci", [ @@ -128,6 +129,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase): item = make_item(url) result = self.extractor.download(item) + assert result, f"download() returned {result!r} — Selenium may have failed (e.g., window close timeout)" assert result.status == "antibot", "Expected status to be 'antibot'" # Check title contains all required words (case-insensitive) diff --git a/tests/test_none_filename_handling.py b/tests/test_none_filename_handling.py new file mode 100644 index 0000000..3299682 --- /dev/null +++ b/tests/test_none_filename_handling.py @@ -0,0 +1,259 @@ +""" +Tests for handling Media objects with None filename. + +When download_from_url fails, it returns None. Various enrichers and +the metadata deduplication logic must gracefully handle Media objects +where filename is None, rather than crashing with TypeError. +""" + +from datetime import datetime, timezone +from unittest.mock import MagicMock + +import pytest + +from auto_archiver.core.metadata import Metadata, Media +from auto_archiver.modules.hash_enricher import HashEnricher +from auto_archiver.modules.meta_enricher import MetaEnricher + + +# ── HashEnricher ────────────────────────────────────────────────────── + + +class TestHashEnricherNoneFilename: + """hash_enricher should skip media with None filename without crashing.""" + + @pytest.fixture(autouse=True) + def setup(self, setup_module): + self.enricher = setup_module(HashEnricher, {"algorithm": "SHA-256", "chunksize": 100}) + + def test_skips_none_filename(self): + m = Metadata().set_url("https://example.com") + media = Media(filename=None) + media.set("src", "https://example.com/video.mp4") + m.add_media(media) + + # Should not raise + self.enricher.enrich(m) + # No hash should be set + assert m.media[0].get("hash") is None + + def test_hashes_valid_skips_none(self, tmp_path): + """Mix of valid and None-filename media: only valid ones get hashed.""" + valid_file = tmp_path / "test.txt" + valid_file.write_text("hello world") + + m = Metadata().set_url("https://example.com") + m.add_media(Media(filename=str(valid_file))) + m.add_media(Media(filename=None)) + + self.enricher.enrich(m) + + assert m.media[0].get("hash") is not None + assert m.media[1].get("hash") is None + + def test_all_none_filenames(self): + """All media have None filename – enricher should not crash.""" + m = Metadata().set_url("https://example.com") + m.add_media(Media(filename=None)) + m.add_media(Media(filename=None)) + + self.enricher.enrich(m) + + assert len(m.media) == 2 + for media in m.media: + assert media.get("hash") is None + + +# ── MetaEnricher ────────────────────────────────────────────────────── + + +class TestMetaEnricherNoneFilename: + """meta_enricher should skip media with None filename without crashing.""" + + @pytest.fixture(autouse=True) + def setup(self, setup_module): + self.enricher = setup_module(MetaEnricher, {}) + + def test_skips_none_filename(self): + m = Metadata().set_url("https://example.com") + m.set("_processed_at", datetime.now(timezone.utc)) + media = Media(filename=None) + media.set("src", "https://example.com/video.mp4") + m.add_media(media) + + # Should not raise + self.enricher.enrich(m) + assert m.get("total_bytes") == 0 + + def test_sizes_valid_skips_none(self, tmp_path): + """Mix of valid and None-filename media: only valid ones get sized.""" + valid_file = tmp_path / "test.txt" + valid_file.write_text("A" * 500) + + m = Metadata().set_url("https://example.com") + m.set("_processed_at", datetime.now(timezone.utc)) + m.add_media(Media(filename=str(valid_file))) + m.add_media(Media(filename=None)) + + self.enricher.enrich(m) + + assert m.media[0].get("bytes") == 500 + assert m.media[1].get("bytes") is None + assert m.get("total_bytes") == 500 + + +# ── Metadata.remove_duplicate_media_by_hash ─────────────────────────── + + +class TestRemoveDuplicateMediaNoneFilename: + """remove_duplicate_media_by_hash should keep media with None filename.""" + + def test_none_filename_kept(self): + m = Metadata().set_url("https://example.com") + none_media = Media(filename=None) + none_media.set("src", "https://example.com/video.mp4") + m.add_media(none_media) + + m.remove_duplicate_media_by_hash() + + assert len(m.media) == 1 + assert m.media[0].filename is None + + def test_none_and_valid_mixed(self, tmp_path): + """None-filename media is kept alongside valid-filename media.""" + valid_file = tmp_path / "test.txt" + valid_file.write_text("content") + + m = Metadata().set_url("https://example.com") + m.add_media(Media(filename=str(valid_file))) + none_media = Media(filename=None) + none_media.set("src", "https://example.com/video.mp4") + m.add_media(none_media) + + m.remove_duplicate_media_by_hash() + + assert len(m.media) == 2 + + def test_multiple_none_filename_all_kept(self): + """Multiple None-filename media are all kept (can't deduplicate without file).""" + m = Metadata().set_url("https://example.com") + m.add_media(Media(filename=None)) + m.add_media(Media(filename=None)) + + m.remove_duplicate_media_by_hash() + + assert len(m.media) == 2 + + +# ── Twitter dropin create_metadata ──────────────────────────────────── + + +class TestTwitterDropinNoneFilename: + """Twitter dropin should skip media when download_from_url returns None.""" + + @pytest.fixture + def twitter_dropin(self): + from auto_archiver.modules.generic_extractor.twitter import Twitter + + return Twitter() + + def test_create_metadata_skips_failed_photo_download(self, twitter_dropin): + """When download_from_url returns None for a photo, it's not added to media.""" + tweet = { + "user": {"name": "Test User"}, + "created_at": "Sun Feb 08 18:45:00 +0000 2026", + "full_text": "Test tweet with photo", + "entities": { + "media": [ + {"type": "photo", "media_url_https": "https://pbs.twimg.com/media/test.jpg"}, + ] + }, + } + + mock_archiver = MagicMock() + mock_archiver.download_from_url.return_value = None # simulate failed download + + result = twitter_dropin.create_metadata(tweet, None, mock_archiver, "https://x.com/test/status/123") + + # The result should have no media since the download failed + assert len(result.media) == 0 + + def test_create_metadata_skips_failed_video_download(self, twitter_dropin): + """When download_from_url returns None for a video, it's not added to media.""" + tweet = { + "user": {"name": "Test User"}, + "created_at": "Sun Feb 08 18:45:00 +0000 2026", + "full_text": "Test tweet with video", + "entities": { + "media": [ + { + "type": "video", + "video_info": { + "variants": [ + { + "url": "https://video.twimg.com/vid/1280x720/test.mp4", + "content_type": "video/mp4", + }, + ] + }, + }, + ] + }, + } + + mock_archiver = MagicMock() + mock_archiver.download_from_url.return_value = None + + result = twitter_dropin.create_metadata(tweet, None, mock_archiver, "https://x.com/test/status/123") + + assert len(result.media) == 0 + + def test_create_metadata_keeps_successful_download(self, twitter_dropin, tmp_path): + """When download_from_url succeeds, media is added.""" + tweet = { + "user": {"name": "Test User"}, + "created_at": "Sun Feb 08 18:45:00 +0000 2026", + "full_text": "Test tweet with photo", + "entities": { + "media": [ + {"type": "photo", "media_url_https": "https://pbs.twimg.com/media/test.jpg"}, + ] + }, + } + + test_file = tmp_path / "test.jpg" + test_file.write_text("fake image data") + + mock_archiver = MagicMock() + mock_archiver.download_from_url.return_value = str(test_file) + + result = twitter_dropin.create_metadata(tweet, None, mock_archiver, "https://x.com/test/status/123") + + assert len(result.media) == 1 + assert result.media[0].filename == str(test_file) + + def test_create_metadata_mixed_downloads(self, twitter_dropin, tmp_path): + """One download succeeds, one fails – only successful one is kept.""" + tweet = { + "user": {"name": "Test User"}, + "created_at": "Sun Feb 08 18:45:00 +0000 2026", + "full_text": "Test tweet with two photos", + "entities": { + "media": [ + {"type": "photo", "media_url_https": "https://pbs.twimg.com/media/test1.jpg"}, + {"type": "photo", "media_url_https": "https://pbs.twimg.com/media/test2.jpg"}, + ] + }, + } + + test_file = tmp_path / "test1.jpg" + test_file.write_text("fake image data") + + mock_archiver = MagicMock() + # First call succeeds, second fails + mock_archiver.download_from_url.side_effect = [str(test_file), None] + + result = twitter_dropin.create_metadata(tweet, None, mock_archiver, "https://x.com/test/status/123") + + assert len(result.media) == 1 + assert result.media[0].filename == str(test_file)