Bug fix: skip media (with a warning) when a URL download returns no filename

This commit is contained in:
msramalho
2026-03-02 17:01:16 +00:00
parent 5d6c5ac2b1
commit e9a92272c5
12 changed files with 331 additions and 21 deletions

View File

@@ -181,6 +181,9 @@ class Metadata:
media_hashes = set()
new_media = []
for m in self.media:
if not m.filename:
new_media.append(m)
continue
h = m.get("hash")
if not h:
h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)

View File

@@ -39,12 +39,18 @@ class Bluesky(GenericDropin):
media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
for image_media in image_medias:
url = media_url.format(image_media["image"]["ref"]["$link"], post["author"]["did"])
image_media = archiver.download_from_url(url)
media.append(Media(image_media))
filename = archiver.download_from_url(url)
if filename:
media.append(Media(filename))
else:
logger.warning(f"Failed to download Bluesky image from {url}")
for video_media in video_medias:
url = media_url.format(video_media["ref"]["$link"], post["author"]["did"])
video_media = archiver.download_from_url(url)
media.append(Media(video_media))
filename = archiver.download_from_url(url)
if filename:
media.append(Media(filename))
else:
logger.warning(f"Failed to download Bluesky video from {url}")
return media
def _get_post_data(self, post: dict) -> dict:

View File

@@ -204,8 +204,11 @@ class GenericExtractor(Extractor):
if thumbnail_url:
try:
cover_image_path = self.download_from_url(thumbnail_url)
media = Media(cover_image_path)
metadata.add_media(media, id="cover")
if cover_image_path:
media = Media(cover_image_path)
metadata.add_media(media, id="cover")
else:
logger.warning(f"Failed to download cover image from {thumbnail_url}")
except Exception as e:
logger.error(f"Could not download cover image {thumbnail_url}: {e}")

View File

@@ -1,6 +1,7 @@
from typing import Type
from auto_archiver.utils import traverse_obj
from auto_archiver.utils.custom_logger import logger
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.core.extractor import Extractor
from yt_dlp.extractor.common import InfoExtractor
@@ -58,6 +59,9 @@ class Truth(GenericDropin):
# add the media
for media in post.get("media_attachments", []):
filename = archiver.download_from_url(media["url"])
if not filename:
logger.warning(f"Failed to download media from {media['url']}")
continue
result.add_media(Media(filename), id=media.get("id"))
return result

View File

@@ -157,5 +157,8 @@ class Twitter(GenericDropin):
mimetype = variant["content_type"]
ext = mimetypes.guess_extension(mimetype)
media.filename = archiver.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
if not media.filename:
logger.warning(f"Failed to download media from {media.get('src')}")
continue
result.add_media(media)
return result

View File

@@ -25,6 +25,9 @@ class HashEnricher(Enricher):
logger.debug(f"Calculating media hashes with algo={self.algorithm}")
for i, m in enumerate(to_enrich.media):
if not m.filename:
logger.warning(f"Skipping hash for media without filename: {m}")
continue
if len(hd := self.calculate_hash(m.filename)):
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")

View File

@@ -99,7 +99,10 @@ class InstagramAPIExtractor(Extractor):
result.set_title(user.get("full_name", username)).set("data", user)
if pic_url := user.get("profile_pic_url_hd", user.get("profile_pic_url")):
filename = self.download_from_url(pic_url)
result.add_media(Media(filename=filename), id="profile_picture")
if filename:
result.add_media(Media(filename=filename), id="profile_picture")
else:
logger.warning(f"Failed to download profile picture from {pic_url}")
count_posts = 0
if self.full_profile:
@@ -202,7 +205,10 @@ class InstagramAPIExtractor(Extractor):
if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"):
filename = self.download_from_url(cover_media)
result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
if filename:
result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
else:
logger.warning(f"Failed to download cover media from {cover_media}")
items = h_info.get("items", [])[::-1] # newest to oldest
items = items[: min(max_to_download, len(items))]
@@ -345,7 +351,10 @@ class InstagramAPIExtractor(Extractor):
image_media = None
if image_url := item.get("thumbnail_url"):
filename = self.download_from_url(image_url, verbose=False)
image_media = Media(filename=filename)
if filename:
image_media = Media(filename=filename)
else:
logger.warning(f"Failed to download thumbnail from {image_url}")
# retrieve video info
best_id = item.get("id", item.get("pk"))
@@ -357,16 +366,19 @@ class InstagramAPIExtractor(Extractor):
if video_url := item.get("video_url"):
filename = self.download_from_url(video_url, verbose=False)
video_media = Media(filename=filename)
if taken_at:
video_media.set("date", taken_at)
if code:
video_media.set("url", f"https://www.instagram.com/p/{code}")
if caption_text:
video_media.set("text", caption_text)
video_media.set("preview", [image_media])
video_media.set("data", [item])
return item, video_media, f"{context or 'video'} {best_id}"
if filename:
video_media = Media(filename=filename)
if taken_at:
video_media.set("date", taken_at)
if code:
video_media.set("url", f"https://www.instagram.com/p/{code}")
if caption_text:
video_media.set("text", caption_text)
video_media.set("preview", [image_media])
video_media.set("data", [item])
return item, video_media, f"{context or 'video'} {best_id}"
else:
logger.warning(f"Failed to download video from {video_url}")
elif image_media:
if taken_at:
image_media.set("date", taken_at)

View File

@@ -25,6 +25,9 @@ class MetaEnricher(Enricher):
logger.debug(f"Calculating archive file sizes for {len(to_enrich.media)} media files")
total_size = 0
for media in to_enrich.get_all_media():
if not media.filename:
logger.warning(f"Skipping file size for media without filename: {media}")
continue
file_stats = os.stat(media.filename)
media.set("bytes", file_stats.st_size)
media.set("size", self.human_readable_bytes(file_stats.st_size))

View File

@@ -49,10 +49,18 @@ class TelegramExtractor(Extractor):
if not len(image_urls):
return False
for img_url in image_urls:
result.add_media(Media(self.download_from_url(img_url)))
filename = self.download_from_url(img_url)
if not filename:
logger.warning(f"Failed to download image from {img_url}")
continue
result.add_media(Media(filename))
else:
video_url = video.get("src")
m_video = Media(self.download_from_url(video_url))
video_filename = self.download_from_url(video_url)
if not video_filename:
logger.warning(f"Failed to download video from {video_url}")
return False
m_video = Media(video_filename)
# extract duration from HTML
try:
duration = s.find_all("time")[0].contents[0]

View File

@@ -190,6 +190,9 @@ class TelethonExtractor(Extractor):
)
for i, om_url in enumerate(other_media_urls):
filename = self.download_from_url(om_url, f"{chat}_{group_id}_{i}")
if not filename:
logger.warning(f"Failed to download media from {om_url}")
continue
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
filename_dest = os.path.join(self.tmp_dir, f"{chat}_{group_id}", str(mp.id))

View File

@@ -114,6 +114,9 @@ class TwitterApiExtractor(Extractor):
logger.info(f"Found media {media}")
ext = mimetypes.guess_extension(mimetype)
media.filename = self.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
if not media.filename:
logger.warning(f"Failed to download media from {media.get('src')}")
continue
result.add_media(media)
result.set_content(