mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
260 lines
9.4 KiB
Python
260 lines
9.4 KiB
Python
"""
|
||
Tests for handling Media objects with None filename.
|
||
|
||
When download_from_url fails, it returns None. Various enrichers and
|
||
the metadata deduplication logic must gracefully handle Media objects
|
||
where filename is None, rather than crashing with TypeError.
|
||
"""
|
||
|
||
from datetime import datetime, timezone
|
||
from unittest.mock import MagicMock
|
||
|
||
import pytest
|
||
|
||
from auto_archiver.core.metadata import Metadata, Media
|
||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||
from auto_archiver.modules.meta_enricher import MetaEnricher
|
||
|
||
|
||
# ── HashEnricher ──────────────────────────────────────────────────────
|
||
|
||
|
||
class TestHashEnricherNoneFilename:
|
||
"""hash_enricher should skip media with None filename without crashing."""
|
||
|
||
@pytest.fixture(autouse=True)
|
||
def setup(self, setup_module):
|
||
self.enricher = setup_module(HashEnricher, {"algorithm": "SHA-256", "chunksize": 100})
|
||
|
||
def test_skips_none_filename(self):
|
||
m = Metadata().set_url("https://example.com")
|
||
media = Media(filename=None)
|
||
media.set("src", "https://example.com/video.mp4")
|
||
m.add_media(media)
|
||
|
||
# Should not raise
|
||
self.enricher.enrich(m)
|
||
# No hash should be set
|
||
assert m.media[0].get("hash") is None
|
||
|
||
def test_hashes_valid_skips_none(self, tmp_path):
|
||
"""Mix of valid and None-filename media: only valid ones get hashed."""
|
||
valid_file = tmp_path / "test.txt"
|
||
valid_file.write_text("hello world")
|
||
|
||
m = Metadata().set_url("https://example.com")
|
||
m.add_media(Media(filename=str(valid_file)))
|
||
m.add_media(Media(filename=None))
|
||
|
||
self.enricher.enrich(m)
|
||
|
||
assert m.media[0].get("hash") is not None
|
||
assert m.media[1].get("hash") is None
|
||
|
||
def test_all_none_filenames(self):
|
||
"""All media have None filename – enricher should not crash."""
|
||
m = Metadata().set_url("https://example.com")
|
||
m.add_media(Media(filename=None))
|
||
m.add_media(Media(filename=None))
|
||
|
||
self.enricher.enrich(m)
|
||
|
||
assert len(m.media) == 2
|
||
for media in m.media:
|
||
assert media.get("hash") is None
|
||
|
||
|
||
# ── MetaEnricher ──────────────────────────────────────────────────────
|
||
|
||
|
||
class TestMetaEnricherNoneFilename:
|
||
"""meta_enricher should skip media with None filename without crashing."""
|
||
|
||
@pytest.fixture(autouse=True)
|
||
def setup(self, setup_module):
|
||
self.enricher = setup_module(MetaEnricher, {})
|
||
|
||
def test_skips_none_filename(self):
|
||
m = Metadata().set_url("https://example.com")
|
||
m.set("_processed_at", datetime.now(timezone.utc))
|
||
media = Media(filename=None)
|
||
media.set("src", "https://example.com/video.mp4")
|
||
m.add_media(media)
|
||
|
||
# Should not raise
|
||
self.enricher.enrich(m)
|
||
assert m.get("total_bytes") == 0
|
||
|
||
def test_sizes_valid_skips_none(self, tmp_path):
|
||
"""Mix of valid and None-filename media: only valid ones get sized."""
|
||
valid_file = tmp_path / "test.txt"
|
||
valid_file.write_text("A" * 500)
|
||
|
||
m = Metadata().set_url("https://example.com")
|
||
m.set("_processed_at", datetime.now(timezone.utc))
|
||
m.add_media(Media(filename=str(valid_file)))
|
||
m.add_media(Media(filename=None))
|
||
|
||
self.enricher.enrich(m)
|
||
|
||
assert m.media[0].get("bytes") == 500
|
||
assert m.media[1].get("bytes") is None
|
||
assert m.get("total_bytes") == 500
|
||
|
||
|
||
# ── Metadata.remove_duplicate_media_by_hash ───────────────────────────
|
||
|
||
|
||
class TestRemoveDuplicateMediaNoneFilename:
|
||
"""remove_duplicate_media_by_hash should keep media with None filename."""
|
||
|
||
def test_none_filename_kept(self):
|
||
m = Metadata().set_url("https://example.com")
|
||
none_media = Media(filename=None)
|
||
none_media.set("src", "https://example.com/video.mp4")
|
||
m.add_media(none_media)
|
||
|
||
m.remove_duplicate_media_by_hash()
|
||
|
||
assert len(m.media) == 1
|
||
assert m.media[0].filename is None
|
||
|
||
def test_none_and_valid_mixed(self, tmp_path):
|
||
"""None-filename media is kept alongside valid-filename media."""
|
||
valid_file = tmp_path / "test.txt"
|
||
valid_file.write_text("content")
|
||
|
||
m = Metadata().set_url("https://example.com")
|
||
m.add_media(Media(filename=str(valid_file)))
|
||
none_media = Media(filename=None)
|
||
none_media.set("src", "https://example.com/video.mp4")
|
||
m.add_media(none_media)
|
||
|
||
m.remove_duplicate_media_by_hash()
|
||
|
||
assert len(m.media) == 2
|
||
|
||
def test_multiple_none_filename_all_kept(self):
|
||
"""Multiple None-filename media are all kept (can't deduplicate without file)."""
|
||
m = Metadata().set_url("https://example.com")
|
||
m.add_media(Media(filename=None))
|
||
m.add_media(Media(filename=None))
|
||
|
||
m.remove_duplicate_media_by_hash()
|
||
|
||
assert len(m.media) == 2
|
||
|
||
|
||
# ── Twitter dropin create_metadata ────────────────────────────────────
|
||
|
||
|
||
class TestTwitterDropinNoneFilename:
|
||
"""Twitter dropin should skip media when download_from_url returns None."""
|
||
|
||
@pytest.fixture
|
||
def twitter_dropin(self):
|
||
from auto_archiver.modules.generic_extractor.twitter import Twitter
|
||
|
||
return Twitter()
|
||
|
||
def test_create_metadata_skips_failed_photo_download(self, twitter_dropin):
|
||
"""When download_from_url returns None for a photo, it's not added to media."""
|
||
tweet = {
|
||
"user": {"name": "Test User"},
|
||
"created_at": "Sun Feb 08 18:45:00 +0000 2026",
|
||
"full_text": "Test tweet with photo",
|
||
"entities": {
|
||
"media": [
|
||
{"type": "photo", "media_url_https": "https://pbs.twimg.com/media/test.jpg"},
|
||
]
|
||
},
|
||
}
|
||
|
||
mock_archiver = MagicMock()
|
||
mock_archiver.download_from_url.return_value = None # simulate failed download
|
||
|
||
result = twitter_dropin.create_metadata(tweet, None, mock_archiver, "https://x.com/test/status/123")
|
||
|
||
# The result should have no media since the download failed
|
||
assert len(result.media) == 0
|
||
|
||
def test_create_metadata_skips_failed_video_download(self, twitter_dropin):
|
||
"""When download_from_url returns None for a video, it's not added to media."""
|
||
tweet = {
|
||
"user": {"name": "Test User"},
|
||
"created_at": "Sun Feb 08 18:45:00 +0000 2026",
|
||
"full_text": "Test tweet with video",
|
||
"entities": {
|
||
"media": [
|
||
{
|
||
"type": "video",
|
||
"video_info": {
|
||
"variants": [
|
||
{
|
||
"url": "https://video.twimg.com/vid/1280x720/test.mp4",
|
||
"content_type": "video/mp4",
|
||
},
|
||
]
|
||
},
|
||
},
|
||
]
|
||
},
|
||
}
|
||
|
||
mock_archiver = MagicMock()
|
||
mock_archiver.download_from_url.return_value = None
|
||
|
||
result = twitter_dropin.create_metadata(tweet, None, mock_archiver, "https://x.com/test/status/123")
|
||
|
||
assert len(result.media) == 0
|
||
|
||
def test_create_metadata_keeps_successful_download(self, twitter_dropin, tmp_path):
|
||
"""When download_from_url succeeds, media is added."""
|
||
tweet = {
|
||
"user": {"name": "Test User"},
|
||
"created_at": "Sun Feb 08 18:45:00 +0000 2026",
|
||
"full_text": "Test tweet with photo",
|
||
"entities": {
|
||
"media": [
|
||
{"type": "photo", "media_url_https": "https://pbs.twimg.com/media/test.jpg"},
|
||
]
|
||
},
|
||
}
|
||
|
||
test_file = tmp_path / "test.jpg"
|
||
test_file.write_text("fake image data")
|
||
|
||
mock_archiver = MagicMock()
|
||
mock_archiver.download_from_url.return_value = str(test_file)
|
||
|
||
result = twitter_dropin.create_metadata(tweet, None, mock_archiver, "https://x.com/test/status/123")
|
||
|
||
assert len(result.media) == 1
|
||
assert result.media[0].filename == str(test_file)
|
||
|
||
def test_create_metadata_mixed_downloads(self, twitter_dropin, tmp_path):
|
||
"""One download succeeds, one fails – only successful one is kept."""
|
||
tweet = {
|
||
"user": {"name": "Test User"},
|
||
"created_at": "Sun Feb 08 18:45:00 +0000 2026",
|
||
"full_text": "Test tweet with two photos",
|
||
"entities": {
|
||
"media": [
|
||
{"type": "photo", "media_url_https": "https://pbs.twimg.com/media/test1.jpg"},
|
||
{"type": "photo", "media_url_https": "https://pbs.twimg.com/media/test2.jpg"},
|
||
]
|
||
},
|
||
}
|
||
|
||
test_file = tmp_path / "test1.jpg"
|
||
test_file.write_text("fake image data")
|
||
|
||
mock_archiver = MagicMock()
|
||
# First call succeeds, second fails
|
||
mock_archiver.download_from_url.side_effect = [str(test_file), None]
|
||
|
||
result = twitter_dropin.create_metadata(tweet, None, mock_archiver, "https://x.com/test/status/123")
|
||
|
||
assert len(result.media) == 1
|
||
assert result.media[0].filename == str(test_file)
|