mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
Bug fix: handle a missing filename when a URL download fails
This commit is contained in:
@@ -181,6 +181,9 @@ class Metadata:
|
||||
media_hashes = set()
|
||||
new_media = []
|
||||
for m in self.media:
|
||||
if not m.filename:
|
||||
new_media.append(m)
|
||||
continue
|
||||
h = m.get("hash")
|
||||
if not h:
|
||||
h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
|
||||
|
||||
@@ -39,12 +39,18 @@ class Bluesky(GenericDropin):
|
||||
media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
|
||||
for image_media in image_medias:
|
||||
url = media_url.format(image_media["image"]["ref"]["$link"], post["author"]["did"])
|
||||
image_media = archiver.download_from_url(url)
|
||||
media.append(Media(image_media))
|
||||
filename = archiver.download_from_url(url)
|
||||
if filename:
|
||||
media.append(Media(filename))
|
||||
else:
|
||||
logger.warning(f"Failed to download Bluesky image from {url}")
|
||||
for video_media in video_medias:
|
||||
url = media_url.format(video_media["ref"]["$link"], post["author"]["did"])
|
||||
video_media = archiver.download_from_url(url)
|
||||
media.append(Media(video_media))
|
||||
filename = archiver.download_from_url(url)
|
||||
if filename:
|
||||
media.append(Media(filename))
|
||||
else:
|
||||
logger.warning(f"Failed to download Bluesky video from {url}")
|
||||
return media
|
||||
|
||||
def _get_post_data(self, post: dict) -> dict:
|
||||
|
||||
@@ -204,8 +204,11 @@ class GenericExtractor(Extractor):
|
||||
if thumbnail_url:
|
||||
try:
|
||||
cover_image_path = self.download_from_url(thumbnail_url)
|
||||
media = Media(cover_image_path)
|
||||
metadata.add_media(media, id="cover")
|
||||
if cover_image_path:
|
||||
media = Media(cover_image_path)
|
||||
metadata.add_media(media, id="cover")
|
||||
else:
|
||||
logger.warning(f"Failed to download cover image from {thumbnail_url}")
|
||||
except Exception as e:
|
||||
logger.error(f"Could not download cover image {thumbnail_url}: {e}")
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from typing import Type
|
||||
|
||||
from auto_archiver.utils import traverse_obj
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from yt_dlp.extractor.common import InfoExtractor
|
||||
@@ -58,6 +59,9 @@ class Truth(GenericDropin):
|
||||
# add the media
|
||||
for media in post.get("media_attachments", []):
|
||||
filename = archiver.download_from_url(media["url"])
|
||||
if not filename:
|
||||
logger.warning(f"Failed to download media from {media['url']}")
|
||||
continue
|
||||
result.add_media(Media(filename), id=media.get("id"))
|
||||
|
||||
return result
|
||||
|
||||
@@ -157,5 +157,8 @@ class Twitter(GenericDropin):
|
||||
mimetype = variant["content_type"]
|
||||
ext = mimetypes.guess_extension(mimetype)
|
||||
media.filename = archiver.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
|
||||
if not media.filename:
|
||||
logger.warning(f"Failed to download media from {media.get('src')}")
|
||||
continue
|
||||
result.add_media(media)
|
||||
return result
|
||||
|
||||
@@ -25,6 +25,9 @@ class HashEnricher(Enricher):
|
||||
logger.debug(f"Calculating media hashes with algo={self.algorithm}")
|
||||
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
if not m.filename:
|
||||
logger.warning(f"Skipping hash for media without filename: {m}")
|
||||
continue
|
||||
if len(hd := self.calculate_hash(m.filename)):
|
||||
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
||||
|
||||
|
||||
@@ -99,7 +99,10 @@ class InstagramAPIExtractor(Extractor):
|
||||
result.set_title(user.get("full_name", username)).set("data", user)
|
||||
if pic_url := user.get("profile_pic_url_hd", user.get("profile_pic_url")):
|
||||
filename = self.download_from_url(pic_url)
|
||||
result.add_media(Media(filename=filename), id="profile_picture")
|
||||
if filename:
|
||||
result.add_media(Media(filename=filename), id="profile_picture")
|
||||
else:
|
||||
logger.warning(f"Failed to download profile picture from {pic_url}")
|
||||
|
||||
count_posts = 0
|
||||
if self.full_profile:
|
||||
@@ -202,7 +205,10 @@ class InstagramAPIExtractor(Extractor):
|
||||
|
||||
if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"):
|
||||
filename = self.download_from_url(cover_media)
|
||||
result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
|
||||
if filename:
|
||||
result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
|
||||
else:
|
||||
logger.warning(f"Failed to download cover media from {cover_media}")
|
||||
|
||||
items = h_info.get("items", [])[::-1] # newest to oldest
|
||||
items = items[: min(max_to_download, len(items))]
|
||||
@@ -345,7 +351,10 @@ class InstagramAPIExtractor(Extractor):
|
||||
image_media = None
|
||||
if image_url := item.get("thumbnail_url"):
|
||||
filename = self.download_from_url(image_url, verbose=False)
|
||||
image_media = Media(filename=filename)
|
||||
if filename:
|
||||
image_media = Media(filename=filename)
|
||||
else:
|
||||
logger.warning(f"Failed to download thumbnail from {image_url}")
|
||||
|
||||
# retrieve video info
|
||||
best_id = item.get("id", item.get("pk"))
|
||||
@@ -357,16 +366,19 @@ class InstagramAPIExtractor(Extractor):
|
||||
|
||||
if video_url := item.get("video_url"):
|
||||
filename = self.download_from_url(video_url, verbose=False)
|
||||
video_media = Media(filename=filename)
|
||||
if taken_at:
|
||||
video_media.set("date", taken_at)
|
||||
if code:
|
||||
video_media.set("url", f"https://www.instagram.com/p/{code}")
|
||||
if caption_text:
|
||||
video_media.set("text", caption_text)
|
||||
video_media.set("preview", [image_media])
|
||||
video_media.set("data", [item])
|
||||
return item, video_media, f"{context or 'video'} {best_id}"
|
||||
if filename:
|
||||
video_media = Media(filename=filename)
|
||||
if taken_at:
|
||||
video_media.set("date", taken_at)
|
||||
if code:
|
||||
video_media.set("url", f"https://www.instagram.com/p/{code}")
|
||||
if caption_text:
|
||||
video_media.set("text", caption_text)
|
||||
video_media.set("preview", [image_media])
|
||||
video_media.set("data", [item])
|
||||
return item, video_media, f"{context or 'video'} {best_id}"
|
||||
else:
|
||||
logger.warning(f"Failed to download video from {video_url}")
|
||||
elif image_media:
|
||||
if taken_at:
|
||||
image_media.set("date", taken_at)
|
||||
|
||||
@@ -25,6 +25,9 @@ class MetaEnricher(Enricher):
|
||||
logger.debug(f"Calculating archive file sizes for {len(to_enrich.media)} media files")
|
||||
total_size = 0
|
||||
for media in to_enrich.get_all_media():
|
||||
if not media.filename:
|
||||
logger.warning(f"Skipping file size for media without filename: {media}")
|
||||
continue
|
||||
file_stats = os.stat(media.filename)
|
||||
media.set("bytes", file_stats.st_size)
|
||||
media.set("size", self.human_readable_bytes(file_stats.st_size))
|
||||
|
||||
@@ -49,10 +49,18 @@ class TelegramExtractor(Extractor):
|
||||
if not len(image_urls):
|
||||
return False
|
||||
for img_url in image_urls:
|
||||
result.add_media(Media(self.download_from_url(img_url)))
|
||||
filename = self.download_from_url(img_url)
|
||||
if not filename:
|
||||
logger.warning(f"Failed to download image from {img_url}")
|
||||
continue
|
||||
result.add_media(Media(filename))
|
||||
else:
|
||||
video_url = video.get("src")
|
||||
m_video = Media(self.download_from_url(video_url))
|
||||
video_filename = self.download_from_url(video_url)
|
||||
if not video_filename:
|
||||
logger.warning(f"Failed to download video from {video_url}")
|
||||
return False
|
||||
m_video = Media(video_filename)
|
||||
# extract duration from HTML
|
||||
try:
|
||||
duration = s.find_all("time")[0].contents[0]
|
||||
|
||||
@@ -190,6 +190,9 @@ class TelethonExtractor(Extractor):
|
||||
)
|
||||
for i, om_url in enumerate(other_media_urls):
|
||||
filename = self.download_from_url(om_url, f"{chat}_{group_id}_{i}")
|
||||
if not filename:
|
||||
logger.warning(f"Failed to download media from {om_url}")
|
||||
continue
|
||||
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
|
||||
|
||||
filename_dest = os.path.join(self.tmp_dir, f"{chat}_{group_id}", str(mp.id))
|
||||
|
||||
@@ -114,6 +114,9 @@ class TwitterApiExtractor(Extractor):
|
||||
logger.info(f"Found media {media}")
|
||||
ext = mimetypes.guess_extension(mimetype)
|
||||
media.filename = self.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
|
||||
if not media.filename:
|
||||
logger.warning(f"Failed to download media from {media.get('src')}")
|
||||
continue
|
||||
result.add_media(media)
|
||||
|
||||
result.set_content(
|
||||
|
||||
Reference in New Issue
Block a user