Merge pull request #407 from bellingcat/dev

minor bug fix: handles failed get downloads
This commit is contained in:
Miguel Sozinho Ramalho
2026-03-02 17:10:46 +00:00
committed by GitHub
15 changed files with 335 additions and 22 deletions

View File

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[project]
name = "auto-archiver"
version = "1.2.3"
version = "1.2.4"
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
requires-python = ">=3.10,<3.13"

View File

@@ -181,6 +181,9 @@ class Metadata:
media_hashes = set()
new_media = []
for m in self.media:
if not m.filename:
new_media.append(m)
continue
h = m.get("hash")
if not h:
h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)

View File

@@ -73,6 +73,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
if self.enrich(result):
result.status = "antibot"
return result
return False
def _prepare_user_data_dir(self):
if self.user_data_dir:

View File

@@ -39,12 +39,18 @@ class Bluesky(GenericDropin):
media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
for image_media in image_medias:
url = media_url.format(image_media["image"]["ref"]["$link"], post["author"]["did"])
image_media = archiver.download_from_url(url)
media.append(Media(image_media))
filename = archiver.download_from_url(url)
if filename:
media.append(Media(filename))
else:
logger.warning(f"Failed to download Bluesky image from {url}")
for video_media in video_medias:
url = media_url.format(video_media["ref"]["$link"], post["author"]["did"])
video_media = archiver.download_from_url(url)
media.append(Media(video_media))
filename = archiver.download_from_url(url)
if filename:
media.append(Media(filename))
else:
logger.warning(f"Failed to download Bluesky video from {url}")
return media
def _get_post_data(self, post: dict) -> dict:

View File

@@ -204,8 +204,11 @@ class GenericExtractor(Extractor):
if thumbnail_url:
try:
cover_image_path = self.download_from_url(thumbnail_url)
if cover_image_path:
media = Media(cover_image_path)
metadata.add_media(media, id="cover")
else:
logger.warning(f"Failed to download cover image from {thumbnail_url}")
except Exception as e:
logger.error(f"Could not download cover image {thumbnail_url}: {e}")

View File

@@ -1,6 +1,7 @@
from typing import Type
from auto_archiver.utils import traverse_obj
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.core.extractor import Extractor
from yt_dlp.extractor.common import InfoExtractor
@@ -58,6 +59,9 @@ class Truth(GenericDropin):
# add the media
for media in post.get("media_attachments", []):
filename = archiver.download_from_url(media["url"])
if not filename:
logger.warning(f"Failed to download media from {media['url']}")
continue
result.add_media(Media(filename), id=media.get("id"))
return result

View File

@@ -157,5 +157,8 @@ class Twitter(GenericDropin):
mimetype = variant["content_type"]
ext = mimetypes.guess_extension(mimetype)
media.filename = archiver.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
if not media.filename:
logger.warning(f"Failed to download media from {media.get('src')}")
continue
result.add_media(media)
return result

View File

@@ -25,6 +25,9 @@ class HashEnricher(Enricher):
logger.debug(f"Calculating media hashes with algo={self.algorithm}")
for i, m in enumerate(to_enrich.media):
if not m.filename:
logger.warning(f"Skipping hash for media without filename: {m}")
continue
if len(hd := self.calculate_hash(m.filename)):
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")

View File

@@ -99,7 +99,10 @@ class InstagramAPIExtractor(Extractor):
result.set_title(user.get("full_name", username)).set("data", user)
if pic_url := user.get("profile_pic_url_hd", user.get("profile_pic_url")):
filename = self.download_from_url(pic_url)
if filename:
result.add_media(Media(filename=filename), id="profile_picture")
else:
logger.warning(f"Failed to download profile picture from {pic_url}")
count_posts = 0
if self.full_profile:
@@ -202,7 +205,10 @@ class InstagramAPIExtractor(Extractor):
if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"):
filename = self.download_from_url(cover_media)
if filename:
result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
else:
logger.warning(f"Failed to download cover media from {cover_media}")
items = h_info.get("items", [])[::-1] # newest to oldest
items = items[: min(max_to_download, len(items))]
@@ -345,7 +351,10 @@ class InstagramAPIExtractor(Extractor):
image_media = None
if image_url := item.get("thumbnail_url"):
filename = self.download_from_url(image_url, verbose=False)
if filename:
image_media = Media(filename=filename)
else:
logger.warning(f"Failed to download thumbnail from {image_url}")
# retrieve video info
best_id = item.get("id", item.get("pk"))
@@ -357,6 +366,7 @@ class InstagramAPIExtractor(Extractor):
if video_url := item.get("video_url"):
filename = self.download_from_url(video_url, verbose=False)
if filename:
video_media = Media(filename=filename)
if taken_at:
video_media.set("date", taken_at)
@@ -367,6 +377,8 @@ class InstagramAPIExtractor(Extractor):
video_media.set("preview", [image_media])
video_media.set("data", [item])
return item, video_media, f"{context or 'video'} {best_id}"
else:
logger.warning(f"Failed to download video from {video_url}")
elif image_media:
if taken_at:
image_media.set("date", taken_at)

View File

@@ -25,6 +25,9 @@ class MetaEnricher(Enricher):
logger.debug(f"Calculating archive file sizes for {len(to_enrich.media)} media files")
total_size = 0
for media in to_enrich.get_all_media():
if not media.filename:
logger.warning(f"Skipping file size for media without filename: {media}")
continue
file_stats = os.stat(media.filename)
media.set("bytes", file_stats.st_size)
media.set("size", self.human_readable_bytes(file_stats.st_size))

View File

@@ -49,10 +49,18 @@ class TelegramExtractor(Extractor):
if not len(image_urls):
return False
for img_url in image_urls:
result.add_media(Media(self.download_from_url(img_url)))
filename = self.download_from_url(img_url)
if not filename:
logger.warning(f"Failed to download image from {img_url}")
continue
result.add_media(Media(filename))
else:
video_url = video.get("src")
m_video = Media(self.download_from_url(video_url))
video_filename = self.download_from_url(video_url)
if not video_filename:
logger.warning(f"Failed to download video from {video_url}")
return False
m_video = Media(video_filename)
# extract duration from HTML
try:
duration = s.find_all("time")[0].contents[0]

View File

@@ -190,6 +190,9 @@ class TelethonExtractor(Extractor):
)
for i, om_url in enumerate(other_media_urls):
filename = self.download_from_url(om_url, f"{chat}_{group_id}_{i}")
if not filename:
logger.warning(f"Failed to download media from {om_url}")
continue
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
filename_dest = os.path.join(self.tmp_dir, f"{chat}_{group_id}", str(mp.id))

View File

@@ -114,6 +114,9 @@ class TwitterApiExtractor(Extractor):
logger.info(f"Found media {media}")
ext = mimetypes.guess_extension(mimetype)
media.filename = self.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
if not media.filename:
logger.warning(f"Failed to download media from {media.get('src')}")
continue
result.add_media(media)
result.set_content(

View File

@@ -53,6 +53,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
}
@pytest.mark.download
@pytest.mark.flaky(reruns=2, reruns_delay=5)
@pytest.mark.parametrize(
"url,in_title,in_text,image_count,video_count,skip_ci",
[
@@ -128,6 +129,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
item = make_item(url)
result = self.extractor.download(item)
assert result, f"download() returned {result!r} — Selenium may have failed (e.g., window close timeout)"
assert result.status == "antibot", "Expected status to be 'antibot'"
# Check title contains all required words (case-insensitive)

View File

@@ -0,0 +1,259 @@
"""
Tests for handling Media objects with None filename.
When download_from_url fails, it returns None. Various enrichers and
the metadata deduplication logic must gracefully handle Media objects
where filename is None, rather than crashing with TypeError.
"""
from datetime import datetime, timezone
from unittest.mock import MagicMock
import pytest
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.modules.meta_enricher import MetaEnricher
# ── HashEnricher ──────────────────────────────────────────────────────
class TestHashEnricherNoneFilename:
    """Verify that HashEnricher tolerates media entries whose filename is None."""

    @pytest.fixture(autouse=True)
    def setup(self, setup_module):
        """Create a SHA-256 HashEnricher through the shared ``setup_module`` fixture."""
        config = {"algorithm": "SHA-256", "chunksize": 100}
        self.enricher = setup_module(HashEnricher, config)

    def test_skips_none_filename(self):
        """A single file-less media entry is enriched without error and stays unhashed."""
        item = Metadata().set_url("https://example.com")
        failed_download = Media(filename=None)
        failed_download.set("src", "https://example.com/video.mp4")
        item.add_media(failed_download)

        # The call itself is the first check: it must complete without raising.
        self.enricher.enrich(item)

        # There was no file to read, so no hash may have been recorded.
        assert item.media[0].get("hash") is None

    def test_hashes_valid_skips_none(self, tmp_path):
        """With one real file and one file-less entry, only the real file gets a hash."""
        real_file = tmp_path / "test.txt"
        real_file.write_text("hello world")

        item = Metadata().set_url("https://example.com")
        for filename in (str(real_file), None):
            item.add_media(Media(filename=filename))
        self.enricher.enrich(item)

        with_file, without_file = item.media[0], item.media[1]
        assert with_file.get("hash") is not None
        assert without_file.get("hash") is None

    def test_all_none_filenames(self):
        """When every media entry lacks a filename, the enricher must not crash."""
        item = Metadata().set_url("https://example.com")
        for _ in range(2):
            item.add_media(Media(filename=None))
        self.enricher.enrich(item)

        assert len(item.media) == 2
        assert all(entry.get("hash") is None for entry in item.media)
# ── MetaEnricher ──────────────────────────────────────────────────────
class TestMetaEnricherNoneFilename:
    """Verify that MetaEnricher tolerates media entries whose filename is None."""

    @pytest.fixture(autouse=True)
    def setup(self, setup_module):
        """Create a MetaEnricher with an empty configuration before each test."""
        self.enricher = setup_module(MetaEnricher, {})

    def test_skips_none_filename(self):
        """A lone file-less media entry contributes nothing to the total size."""
        item = Metadata().set_url("https://example.com")
        item.set("_processed_at", datetime.now(timezone.utc))
        failed_download = Media(filename=None)
        failed_download.set("src", "https://example.com/video.mp4")
        item.add_media(failed_download)

        # The call itself is the first check: it must complete without raising.
        self.enricher.enrich(item)

        assert item.get("total_bytes") == 0

    def test_sizes_valid_skips_none(self, tmp_path):
        """With one real file and one file-less entry, only the real file is sized."""
        real_file = tmp_path / "test.txt"
        real_file.write_text("A" * 500)

        item = Metadata().set_url("https://example.com")
        item.set("_processed_at", datetime.now(timezone.utc))
        for filename in (str(real_file), None):
            item.add_media(Media(filename=filename))
        self.enricher.enrich(item)

        with_file, without_file = item.media[0], item.media[1]
        assert with_file.get("bytes") == 500
        assert without_file.get("bytes") is None
        assert item.get("total_bytes") == 500
# ── Metadata.remove_duplicate_media_by_hash ───────────────────────────
class TestRemoveDuplicateMediaNoneFilename:
    """Verify that remove_duplicate_media_by_hash retains media with a None filename."""

    def test_none_filename_kept(self):
        """A single file-less media entry survives deduplication unchanged."""
        item = Metadata().set_url("https://example.com")
        failed_download = Media(filename=None)
        failed_download.set("src", "https://example.com/video.mp4")
        item.add_media(failed_download)

        item.remove_duplicate_media_by_hash()

        assert len(item.media) == 1
        assert item.media[0].filename is None

    def test_none_and_valid_mixed(self, tmp_path):
        """A file-less entry is retained next to an entry backed by a real file."""
        real_file = tmp_path / "test.txt"
        real_file.write_text("content")

        item = Metadata().set_url("https://example.com")
        item.add_media(Media(filename=str(real_file)))
        failed_download = Media(filename=None)
        failed_download.set("src", "https://example.com/video.mp4")
        item.add_media(failed_download)

        item.remove_duplicate_media_by_hash()

        assert len(item.media) == 2

    def test_multiple_none_filename_all_kept(self):
        """Several file-less entries are all retained: without a file there is nothing to compare."""
        item = Metadata().set_url("https://example.com")
        for _ in range(2):
            item.add_media(Media(filename=None))

        item.remove_duplicate_media_by_hash()

        assert len(item.media) == 2
# ── Twitter dropin create_metadata ────────────────────────────────────
class TestTwitterDropinNoneFilename:
    """Verify that the Twitter dropin leaves out media whose download returned None."""

    # URL passed to create_metadata in every test of this class.
    TWEET_URL = "https://x.com/test/status/123"

    @pytest.fixture
    def twitter_dropin(self):
        """Return a fresh Twitter dropin; the import is deferred until the fixture runs."""
        from auto_archiver.modules.generic_extractor.twitter import Twitter

        return Twitter()

    @staticmethod
    def _tweet(full_text, media_entries):
        """Build the minimal tweet payload used by these tests around the given media list."""
        return {
            "user": {"name": "Test User"},
            "created_at": "Sun Feb 08 18:45:00 +0000 2026",
            "full_text": full_text,
            "entities": {"media": media_entries},
        }

    @staticmethod
    def _photo(media_url):
        """Build a photo media entry pointing at ``media_url``."""
        return {"type": "photo", "media_url_https": media_url}

    def test_create_metadata_skips_failed_photo_download(self, twitter_dropin):
        """A photo whose download returns None is not added to the result's media."""
        tweet = self._tweet(
            "Test tweet with photo",
            [self._photo("https://pbs.twimg.com/media/test.jpg")],
        )
        archiver = MagicMock()
        archiver.download_from_url.return_value = None  # every download fails

        result = twitter_dropin.create_metadata(tweet, None, archiver, self.TWEET_URL)

        # Nothing was downloaded, so nothing may be attached.
        assert len(result.media) == 0

    def test_create_metadata_skips_failed_video_download(self, twitter_dropin):
        """A video whose download returns None is not added to the result's media."""
        variant = {
            "url": "https://video.twimg.com/vid/1280x720/test.mp4",
            "content_type": "video/mp4",
        }
        video = {"type": "video", "video_info": {"variants": [variant]}}
        tweet = self._tweet("Test tweet with video", [video])
        archiver = MagicMock()
        archiver.download_from_url.return_value = None

        result = twitter_dropin.create_metadata(tweet, None, archiver, self.TWEET_URL)

        assert len(result.media) == 0

    def test_create_metadata_keeps_successful_download(self, twitter_dropin, tmp_path):
        """A photo whose download yields a filename is added to the result's media."""
        tweet = self._tweet(
            "Test tweet with photo",
            [self._photo("https://pbs.twimg.com/media/test.jpg")],
        )
        downloaded = tmp_path / "test.jpg"
        downloaded.write_text("fake image data")
        archiver = MagicMock()
        archiver.download_from_url.return_value = str(downloaded)

        result = twitter_dropin.create_metadata(tweet, None, archiver, self.TWEET_URL)

        assert len(result.media) == 1
        assert result.media[0].filename == str(downloaded)

    def test_create_metadata_mixed_downloads(self, twitter_dropin, tmp_path):
        """One download succeeds and one fails: only the successful one is kept."""
        tweet = self._tweet(
            "Test tweet with two photos",
            [
                self._photo("https://pbs.twimg.com/media/test1.jpg"),
                self._photo("https://pbs.twimg.com/media/test2.jpg"),
            ],
        )
        downloaded = tmp_path / "test1.jpg"
        downloaded.write_text("fake image data")
        archiver = MagicMock()
        # The first call yields a filename, the second simulates a failed download.
        archiver.download_from_url.side_effect = [str(downloaded), None]

        result = twitter_dropin.create_metadata(tweet, None, archiver, self.TWEET_URL)

        assert len(result.media) == 1
        assert result.media[0].filename == str(downloaded)