mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
Merge pull request #407 from bellingcat/dev
minor bug fix: handles failed get downloads
This commit is contained in:
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "auto-archiver"
|
name = "auto-archiver"
|
||||||
version = "1.2.3"
|
version = "1.2.4"
|
||||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||||
|
|
||||||
requires-python = ">=3.10,<3.13"
|
requires-python = ">=3.10,<3.13"
|
||||||
|
|||||||
@@ -181,6 +181,9 @@ class Metadata:
|
|||||||
media_hashes = set()
|
media_hashes = set()
|
||||||
new_media = []
|
new_media = []
|
||||||
for m in self.media:
|
for m in self.media:
|
||||||
|
if not m.filename:
|
||||||
|
new_media.append(m)
|
||||||
|
continue
|
||||||
h = m.get("hash")
|
h = m.get("hash")
|
||||||
if not h:
|
if not h:
|
||||||
h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
|
h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
|
||||||
|
|||||||
@@ -73,6 +73,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
if self.enrich(result):
|
if self.enrich(result):
|
||||||
result.status = "antibot"
|
result.status = "antibot"
|
||||||
return result
|
return result
|
||||||
|
return False
|
||||||
|
|
||||||
def _prepare_user_data_dir(self):
|
def _prepare_user_data_dir(self):
|
||||||
if self.user_data_dir:
|
if self.user_data_dir:
|
||||||
|
|||||||
@@ -39,12 +39,18 @@ class Bluesky(GenericDropin):
|
|||||||
media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
|
media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
|
||||||
for image_media in image_medias:
|
for image_media in image_medias:
|
||||||
url = media_url.format(image_media["image"]["ref"]["$link"], post["author"]["did"])
|
url = media_url.format(image_media["image"]["ref"]["$link"], post["author"]["did"])
|
||||||
image_media = archiver.download_from_url(url)
|
filename = archiver.download_from_url(url)
|
||||||
media.append(Media(image_media))
|
if filename:
|
||||||
|
media.append(Media(filename))
|
||||||
|
else:
|
||||||
|
logger.warning(f"Failed to download Bluesky image from {url}")
|
||||||
for video_media in video_medias:
|
for video_media in video_medias:
|
||||||
url = media_url.format(video_media["ref"]["$link"], post["author"]["did"])
|
url = media_url.format(video_media["ref"]["$link"], post["author"]["did"])
|
||||||
video_media = archiver.download_from_url(url)
|
filename = archiver.download_from_url(url)
|
||||||
media.append(Media(video_media))
|
if filename:
|
||||||
|
media.append(Media(filename))
|
||||||
|
else:
|
||||||
|
logger.warning(f"Failed to download Bluesky video from {url}")
|
||||||
return media
|
return media
|
||||||
|
|
||||||
def _get_post_data(self, post: dict) -> dict:
|
def _get_post_data(self, post: dict) -> dict:
|
||||||
|
|||||||
@@ -204,8 +204,11 @@ class GenericExtractor(Extractor):
|
|||||||
if thumbnail_url:
|
if thumbnail_url:
|
||||||
try:
|
try:
|
||||||
cover_image_path = self.download_from_url(thumbnail_url)
|
cover_image_path = self.download_from_url(thumbnail_url)
|
||||||
media = Media(cover_image_path)
|
if cover_image_path:
|
||||||
metadata.add_media(media, id="cover")
|
media = Media(cover_image_path)
|
||||||
|
metadata.add_media(media, id="cover")
|
||||||
|
else:
|
||||||
|
logger.warning(f"Failed to download cover image from {thumbnail_url}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Could not download cover image {thumbnail_url}: {e}")
|
logger.error(f"Could not download cover image {thumbnail_url}: {e}")
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
from typing import Type
|
from typing import Type
|
||||||
|
|
||||||
from auto_archiver.utils import traverse_obj
|
from auto_archiver.utils import traverse_obj
|
||||||
|
from auto_archiver.utils.custom_logger import logger
|
||||||
from auto_archiver.core.metadata import Metadata, Media
|
from auto_archiver.core.metadata import Metadata, Media
|
||||||
from auto_archiver.core.extractor import Extractor
|
from auto_archiver.core.extractor import Extractor
|
||||||
from yt_dlp.extractor.common import InfoExtractor
|
from yt_dlp.extractor.common import InfoExtractor
|
||||||
@@ -58,6 +59,9 @@ class Truth(GenericDropin):
|
|||||||
# add the media
|
# add the media
|
||||||
for media in post.get("media_attachments", []):
|
for media in post.get("media_attachments", []):
|
||||||
filename = archiver.download_from_url(media["url"])
|
filename = archiver.download_from_url(media["url"])
|
||||||
|
if not filename:
|
||||||
|
logger.warning(f"Failed to download media from {media['url']}")
|
||||||
|
continue
|
||||||
result.add_media(Media(filename), id=media.get("id"))
|
result.add_media(Media(filename), id=media.get("id"))
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -157,5 +157,8 @@ class Twitter(GenericDropin):
|
|||||||
mimetype = variant["content_type"]
|
mimetype = variant["content_type"]
|
||||||
ext = mimetypes.guess_extension(mimetype)
|
ext = mimetypes.guess_extension(mimetype)
|
||||||
media.filename = archiver.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
|
media.filename = archiver.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
|
||||||
|
if not media.filename:
|
||||||
|
logger.warning(f"Failed to download media from {media.get('src')}")
|
||||||
|
continue
|
||||||
result.add_media(media)
|
result.add_media(media)
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -25,6 +25,9 @@ class HashEnricher(Enricher):
|
|||||||
logger.debug(f"Calculating media hashes with algo={self.algorithm}")
|
logger.debug(f"Calculating media hashes with algo={self.algorithm}")
|
||||||
|
|
||||||
for i, m in enumerate(to_enrich.media):
|
for i, m in enumerate(to_enrich.media):
|
||||||
|
if not m.filename:
|
||||||
|
logger.warning(f"Skipping hash for media without filename: {m}")
|
||||||
|
continue
|
||||||
if len(hd := self.calculate_hash(m.filename)):
|
if len(hd := self.calculate_hash(m.filename)):
|
||||||
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
||||||
|
|
||||||
|
|||||||
@@ -99,7 +99,10 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
result.set_title(user.get("full_name", username)).set("data", user)
|
result.set_title(user.get("full_name", username)).set("data", user)
|
||||||
if pic_url := user.get("profile_pic_url_hd", user.get("profile_pic_url")):
|
if pic_url := user.get("profile_pic_url_hd", user.get("profile_pic_url")):
|
||||||
filename = self.download_from_url(pic_url)
|
filename = self.download_from_url(pic_url)
|
||||||
result.add_media(Media(filename=filename), id="profile_picture")
|
if filename:
|
||||||
|
result.add_media(Media(filename=filename), id="profile_picture")
|
||||||
|
else:
|
||||||
|
logger.warning(f"Failed to download profile picture from {pic_url}")
|
||||||
|
|
||||||
count_posts = 0
|
count_posts = 0
|
||||||
if self.full_profile:
|
if self.full_profile:
|
||||||
@@ -202,7 +205,10 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
|
|
||||||
if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"):
|
if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"):
|
||||||
filename = self.download_from_url(cover_media)
|
filename = self.download_from_url(cover_media)
|
||||||
result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
|
if filename:
|
||||||
|
result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
|
||||||
|
else:
|
||||||
|
logger.warning(f"Failed to download cover media from {cover_media}")
|
||||||
|
|
||||||
items = h_info.get("items", [])[::-1] # newest to oldest
|
items = h_info.get("items", [])[::-1] # newest to oldest
|
||||||
items = items[: min(max_to_download, len(items))]
|
items = items[: min(max_to_download, len(items))]
|
||||||
@@ -345,7 +351,10 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
image_media = None
|
image_media = None
|
||||||
if image_url := item.get("thumbnail_url"):
|
if image_url := item.get("thumbnail_url"):
|
||||||
filename = self.download_from_url(image_url, verbose=False)
|
filename = self.download_from_url(image_url, verbose=False)
|
||||||
image_media = Media(filename=filename)
|
if filename:
|
||||||
|
image_media = Media(filename=filename)
|
||||||
|
else:
|
||||||
|
logger.warning(f"Failed to download thumbnail from {image_url}")
|
||||||
|
|
||||||
# retrieve video info
|
# retrieve video info
|
||||||
best_id = item.get("id", item.get("pk"))
|
best_id = item.get("id", item.get("pk"))
|
||||||
@@ -357,16 +366,19 @@ class InstagramAPIExtractor(Extractor):
|
|||||||
|
|
||||||
if video_url := item.get("video_url"):
|
if video_url := item.get("video_url"):
|
||||||
filename = self.download_from_url(video_url, verbose=False)
|
filename = self.download_from_url(video_url, verbose=False)
|
||||||
video_media = Media(filename=filename)
|
if filename:
|
||||||
if taken_at:
|
video_media = Media(filename=filename)
|
||||||
video_media.set("date", taken_at)
|
if taken_at:
|
||||||
if code:
|
video_media.set("date", taken_at)
|
||||||
video_media.set("url", f"https://www.instagram.com/p/{code}")
|
if code:
|
||||||
if caption_text:
|
video_media.set("url", f"https://www.instagram.com/p/{code}")
|
||||||
video_media.set("text", caption_text)
|
if caption_text:
|
||||||
video_media.set("preview", [image_media])
|
video_media.set("text", caption_text)
|
||||||
video_media.set("data", [item])
|
video_media.set("preview", [image_media])
|
||||||
return item, video_media, f"{context or 'video'} {best_id}"
|
video_media.set("data", [item])
|
||||||
|
return item, video_media, f"{context or 'video'} {best_id}"
|
||||||
|
else:
|
||||||
|
logger.warning(f"Failed to download video from {video_url}")
|
||||||
elif image_media:
|
elif image_media:
|
||||||
if taken_at:
|
if taken_at:
|
||||||
image_media.set("date", taken_at)
|
image_media.set("date", taken_at)
|
||||||
|
|||||||
@@ -25,6 +25,9 @@ class MetaEnricher(Enricher):
|
|||||||
logger.debug(f"Calculating archive file sizes for {len(to_enrich.media)} media files")
|
logger.debug(f"Calculating archive file sizes for {len(to_enrich.media)} media files")
|
||||||
total_size = 0
|
total_size = 0
|
||||||
for media in to_enrich.get_all_media():
|
for media in to_enrich.get_all_media():
|
||||||
|
if not media.filename:
|
||||||
|
logger.warning(f"Skipping file size for media without filename: {media}")
|
||||||
|
continue
|
||||||
file_stats = os.stat(media.filename)
|
file_stats = os.stat(media.filename)
|
||||||
media.set("bytes", file_stats.st_size)
|
media.set("bytes", file_stats.st_size)
|
||||||
media.set("size", self.human_readable_bytes(file_stats.st_size))
|
media.set("size", self.human_readable_bytes(file_stats.st_size))
|
||||||
|
|||||||
@@ -49,10 +49,18 @@ class TelegramExtractor(Extractor):
|
|||||||
if not len(image_urls):
|
if not len(image_urls):
|
||||||
return False
|
return False
|
||||||
for img_url in image_urls:
|
for img_url in image_urls:
|
||||||
result.add_media(Media(self.download_from_url(img_url)))
|
filename = self.download_from_url(img_url)
|
||||||
|
if not filename:
|
||||||
|
logger.warning(f"Failed to download image from {img_url}")
|
||||||
|
continue
|
||||||
|
result.add_media(Media(filename))
|
||||||
else:
|
else:
|
||||||
video_url = video.get("src")
|
video_url = video.get("src")
|
||||||
m_video = Media(self.download_from_url(video_url))
|
video_filename = self.download_from_url(video_url)
|
||||||
|
if not video_filename:
|
||||||
|
logger.warning(f"Failed to download video from {video_url}")
|
||||||
|
return False
|
||||||
|
m_video = Media(video_filename)
|
||||||
# extract duration from HTML
|
# extract duration from HTML
|
||||||
try:
|
try:
|
||||||
duration = s.find_all("time")[0].contents[0]
|
duration = s.find_all("time")[0].contents[0]
|
||||||
|
|||||||
@@ -190,6 +190,9 @@ class TelethonExtractor(Extractor):
|
|||||||
)
|
)
|
||||||
for i, om_url in enumerate(other_media_urls):
|
for i, om_url in enumerate(other_media_urls):
|
||||||
filename = self.download_from_url(om_url, f"{chat}_{group_id}_{i}")
|
filename = self.download_from_url(om_url, f"{chat}_{group_id}_{i}")
|
||||||
|
if not filename:
|
||||||
|
logger.warning(f"Failed to download media from {om_url}")
|
||||||
|
continue
|
||||||
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
|
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
|
||||||
|
|
||||||
filename_dest = os.path.join(self.tmp_dir, f"{chat}_{group_id}", str(mp.id))
|
filename_dest = os.path.join(self.tmp_dir, f"{chat}_{group_id}", str(mp.id))
|
||||||
|
|||||||
@@ -114,6 +114,9 @@ class TwitterApiExtractor(Extractor):
|
|||||||
logger.info(f"Found media {media}")
|
logger.info(f"Found media {media}")
|
||||||
ext = mimetypes.guess_extension(mimetype)
|
ext = mimetypes.guess_extension(mimetype)
|
||||||
media.filename = self.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
|
media.filename = self.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
|
||||||
|
if not media.filename:
|
||||||
|
logger.warning(f"Failed to download media from {media.get('src')}")
|
||||||
|
continue
|
||||||
result.add_media(media)
|
result.add_media(media)
|
||||||
|
|
||||||
result.set_content(
|
result.set_content(
|
||||||
|
|||||||
@@ -53,6 +53,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
}
|
}
|
||||||
|
|
||||||
@pytest.mark.download
|
@pytest.mark.download
|
||||||
|
@pytest.mark.flaky(reruns=2, reruns_delay=5)
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"url,in_title,in_text,image_count,video_count,skip_ci",
|
"url,in_title,in_text,image_count,video_count,skip_ci",
|
||||||
[
|
[
|
||||||
@@ -128,6 +129,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
item = make_item(url)
|
item = make_item(url)
|
||||||
result = self.extractor.download(item)
|
result = self.extractor.download(item)
|
||||||
|
|
||||||
|
assert result, f"download() returned {result!r} — Selenium may have failed (e.g., window close timeout)"
|
||||||
assert result.status == "antibot", "Expected status to be 'antibot'"
|
assert result.status == "antibot", "Expected status to be 'antibot'"
|
||||||
|
|
||||||
# Check title contains all required words (case-insensitive)
|
# Check title contains all required words (case-insensitive)
|
||||||
|
|||||||
259
tests/test_none_filename_handling.py
Normal file
259
tests/test_none_filename_handling.py
Normal file
@@ -0,0 +1,259 @@
|
|||||||
|
"""
|
||||||
|
Tests for handling Media objects with None filename.
|
||||||
|
|
||||||
|
When download_from_url fails, it returns None. Various enrichers and
|
||||||
|
the metadata deduplication logic must gracefully handle Media objects
|
||||||
|
where filename is None, rather than crashing with TypeError.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from auto_archiver.core.metadata import Metadata, Media
|
||||||
|
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||||
|
from auto_archiver.modules.meta_enricher import MetaEnricher
|
||||||
|
|
||||||
|
|
||||||
|
# ── HashEnricher ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestHashEnricherNoneFilename:
|
||||||
|
"""hash_enricher should skip media with None filename without crashing."""
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def setup(self, setup_module):
|
||||||
|
self.enricher = setup_module(HashEnricher, {"algorithm": "SHA-256", "chunksize": 100})
|
||||||
|
|
||||||
|
def test_skips_none_filename(self):
|
||||||
|
m = Metadata().set_url("https://example.com")
|
||||||
|
media = Media(filename=None)
|
||||||
|
media.set("src", "https://example.com/video.mp4")
|
||||||
|
m.add_media(media)
|
||||||
|
|
||||||
|
# Should not raise
|
||||||
|
self.enricher.enrich(m)
|
||||||
|
# No hash should be set
|
||||||
|
assert m.media[0].get("hash") is None
|
||||||
|
|
||||||
|
def test_hashes_valid_skips_none(self, tmp_path):
|
||||||
|
"""Mix of valid and None-filename media: only valid ones get hashed."""
|
||||||
|
valid_file = tmp_path / "test.txt"
|
||||||
|
valid_file.write_text("hello world")
|
||||||
|
|
||||||
|
m = Metadata().set_url("https://example.com")
|
||||||
|
m.add_media(Media(filename=str(valid_file)))
|
||||||
|
m.add_media(Media(filename=None))
|
||||||
|
|
||||||
|
self.enricher.enrich(m)
|
||||||
|
|
||||||
|
assert m.media[0].get("hash") is not None
|
||||||
|
assert m.media[1].get("hash") is None
|
||||||
|
|
||||||
|
def test_all_none_filenames(self):
|
||||||
|
"""All media have None filename – enricher should not crash."""
|
||||||
|
m = Metadata().set_url("https://example.com")
|
||||||
|
m.add_media(Media(filename=None))
|
||||||
|
m.add_media(Media(filename=None))
|
||||||
|
|
||||||
|
self.enricher.enrich(m)
|
||||||
|
|
||||||
|
assert len(m.media) == 2
|
||||||
|
for media in m.media:
|
||||||
|
assert media.get("hash") is None
|
||||||
|
|
||||||
|
|
||||||
|
# ── MetaEnricher ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestMetaEnricherNoneFilename:
|
||||||
|
"""meta_enricher should skip media with None filename without crashing."""
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def setup(self, setup_module):
|
||||||
|
self.enricher = setup_module(MetaEnricher, {})
|
||||||
|
|
||||||
|
def test_skips_none_filename(self):
|
||||||
|
m = Metadata().set_url("https://example.com")
|
||||||
|
m.set("_processed_at", datetime.now(timezone.utc))
|
||||||
|
media = Media(filename=None)
|
||||||
|
media.set("src", "https://example.com/video.mp4")
|
||||||
|
m.add_media(media)
|
||||||
|
|
||||||
|
# Should not raise
|
||||||
|
self.enricher.enrich(m)
|
||||||
|
assert m.get("total_bytes") == 0
|
||||||
|
|
||||||
|
def test_sizes_valid_skips_none(self, tmp_path):
|
||||||
|
"""Mix of valid and None-filename media: only valid ones get sized."""
|
||||||
|
valid_file = tmp_path / "test.txt"
|
||||||
|
valid_file.write_text("A" * 500)
|
||||||
|
|
||||||
|
m = Metadata().set_url("https://example.com")
|
||||||
|
m.set("_processed_at", datetime.now(timezone.utc))
|
||||||
|
m.add_media(Media(filename=str(valid_file)))
|
||||||
|
m.add_media(Media(filename=None))
|
||||||
|
|
||||||
|
self.enricher.enrich(m)
|
||||||
|
|
||||||
|
assert m.media[0].get("bytes") == 500
|
||||||
|
assert m.media[1].get("bytes") is None
|
||||||
|
assert m.get("total_bytes") == 500
|
||||||
|
|
||||||
|
|
||||||
|
# ── Metadata.remove_duplicate_media_by_hash ───────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoveDuplicateMediaNoneFilename:
|
||||||
|
"""remove_duplicate_media_by_hash should keep media with None filename."""
|
||||||
|
|
||||||
|
def test_none_filename_kept(self):
|
||||||
|
m = Metadata().set_url("https://example.com")
|
||||||
|
none_media = Media(filename=None)
|
||||||
|
none_media.set("src", "https://example.com/video.mp4")
|
||||||
|
m.add_media(none_media)
|
||||||
|
|
||||||
|
m.remove_duplicate_media_by_hash()
|
||||||
|
|
||||||
|
assert len(m.media) == 1
|
||||||
|
assert m.media[0].filename is None
|
||||||
|
|
||||||
|
def test_none_and_valid_mixed(self, tmp_path):
|
||||||
|
"""None-filename media is kept alongside valid-filename media."""
|
||||||
|
valid_file = tmp_path / "test.txt"
|
||||||
|
valid_file.write_text("content")
|
||||||
|
|
||||||
|
m = Metadata().set_url("https://example.com")
|
||||||
|
m.add_media(Media(filename=str(valid_file)))
|
||||||
|
none_media = Media(filename=None)
|
||||||
|
none_media.set("src", "https://example.com/video.mp4")
|
||||||
|
m.add_media(none_media)
|
||||||
|
|
||||||
|
m.remove_duplicate_media_by_hash()
|
||||||
|
|
||||||
|
assert len(m.media) == 2
|
||||||
|
|
||||||
|
def test_multiple_none_filename_all_kept(self):
|
||||||
|
"""Multiple None-filename media are all kept (can't deduplicate without file)."""
|
||||||
|
m = Metadata().set_url("https://example.com")
|
||||||
|
m.add_media(Media(filename=None))
|
||||||
|
m.add_media(Media(filename=None))
|
||||||
|
|
||||||
|
m.remove_duplicate_media_by_hash()
|
||||||
|
|
||||||
|
assert len(m.media) == 2
|
||||||
|
|
||||||
|
|
||||||
|
# ── Twitter dropin create_metadata ────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestTwitterDropinNoneFilename:
|
||||||
|
"""Twitter dropin should skip media when download_from_url returns None."""
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def twitter_dropin(self):
|
||||||
|
from auto_archiver.modules.generic_extractor.twitter import Twitter
|
||||||
|
|
||||||
|
return Twitter()
|
||||||
|
|
||||||
|
def test_create_metadata_skips_failed_photo_download(self, twitter_dropin):
|
||||||
|
"""When download_from_url returns None for a photo, it's not added to media."""
|
||||||
|
tweet = {
|
||||||
|
"user": {"name": "Test User"},
|
||||||
|
"created_at": "Sun Feb 08 18:45:00 +0000 2026",
|
||||||
|
"full_text": "Test tweet with photo",
|
||||||
|
"entities": {
|
||||||
|
"media": [
|
||||||
|
{"type": "photo", "media_url_https": "https://pbs.twimg.com/media/test.jpg"},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
mock_archiver = MagicMock()
|
||||||
|
mock_archiver.download_from_url.return_value = None # simulate failed download
|
||||||
|
|
||||||
|
result = twitter_dropin.create_metadata(tweet, None, mock_archiver, "https://x.com/test/status/123")
|
||||||
|
|
||||||
|
# The result should have no media since the download failed
|
||||||
|
assert len(result.media) == 0
|
||||||
|
|
||||||
|
def test_create_metadata_skips_failed_video_download(self, twitter_dropin):
|
||||||
|
"""When download_from_url returns None for a video, it's not added to media."""
|
||||||
|
tweet = {
|
||||||
|
"user": {"name": "Test User"},
|
||||||
|
"created_at": "Sun Feb 08 18:45:00 +0000 2026",
|
||||||
|
"full_text": "Test tweet with video",
|
||||||
|
"entities": {
|
||||||
|
"media": [
|
||||||
|
{
|
||||||
|
"type": "video",
|
||||||
|
"video_info": {
|
||||||
|
"variants": [
|
||||||
|
{
|
||||||
|
"url": "https://video.twimg.com/vid/1280x720/test.mp4",
|
||||||
|
"content_type": "video/mp4",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
mock_archiver = MagicMock()
|
||||||
|
mock_archiver.download_from_url.return_value = None
|
||||||
|
|
||||||
|
result = twitter_dropin.create_metadata(tweet, None, mock_archiver, "https://x.com/test/status/123")
|
||||||
|
|
||||||
|
assert len(result.media) == 0
|
||||||
|
|
||||||
|
def test_create_metadata_keeps_successful_download(self, twitter_dropin, tmp_path):
|
||||||
|
"""When download_from_url succeeds, media is added."""
|
||||||
|
tweet = {
|
||||||
|
"user": {"name": "Test User"},
|
||||||
|
"created_at": "Sun Feb 08 18:45:00 +0000 2026",
|
||||||
|
"full_text": "Test tweet with photo",
|
||||||
|
"entities": {
|
||||||
|
"media": [
|
||||||
|
{"type": "photo", "media_url_https": "https://pbs.twimg.com/media/test.jpg"},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
test_file = tmp_path / "test.jpg"
|
||||||
|
test_file.write_text("fake image data")
|
||||||
|
|
||||||
|
mock_archiver = MagicMock()
|
||||||
|
mock_archiver.download_from_url.return_value = str(test_file)
|
||||||
|
|
||||||
|
result = twitter_dropin.create_metadata(tweet, None, mock_archiver, "https://x.com/test/status/123")
|
||||||
|
|
||||||
|
assert len(result.media) == 1
|
||||||
|
assert result.media[0].filename == str(test_file)
|
||||||
|
|
||||||
|
def test_create_metadata_mixed_downloads(self, twitter_dropin, tmp_path):
|
||||||
|
"""One download succeeds, one fails – only successful one is kept."""
|
||||||
|
tweet = {
|
||||||
|
"user": {"name": "Test User"},
|
||||||
|
"created_at": "Sun Feb 08 18:45:00 +0000 2026",
|
||||||
|
"full_text": "Test tweet with two photos",
|
||||||
|
"entities": {
|
||||||
|
"media": [
|
||||||
|
{"type": "photo", "media_url_https": "https://pbs.twimg.com/media/test1.jpg"},
|
||||||
|
{"type": "photo", "media_url_https": "https://pbs.twimg.com/media/test2.jpg"},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
test_file = tmp_path / "test1.jpg"
|
||||||
|
test_file.write_text("fake image data")
|
||||||
|
|
||||||
|
mock_archiver = MagicMock()
|
||||||
|
# First call succeeds, second fails
|
||||||
|
mock_archiver.download_from_url.side_effect = [str(test_file), None]
|
||||||
|
|
||||||
|
result = twitter_dropin.create_metadata(tweet, None, mock_archiver, "https://x.com/test/status/123")
|
||||||
|
|
||||||
|
assert len(result.media) == 1
|
||||||
|
assert result.media[0].filename == str(test_file)
|
||||||
Reference in New Issue
Block a user