mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
Unit tests for storage types + fix issues with overly long storage keys on local storage
This commit is contained in:
@@ -6,7 +6,7 @@ nested media retrieval, and type validation.
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import traceback
|
||||
from typing import Any, List
|
||||
from typing import Any, List, Iterator
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json, config
|
||||
import mimetypes
|
||||
@@ -47,7 +47,7 @@ class Media:
|
||||
for any_media in self.all_inner_media(include_self=True):
|
||||
s.store(any_media, url, metadata=metadata)
|
||||
|
||||
def all_inner_media(self, include_self=False):
|
||||
def all_inner_media(self, include_self=False) -> Iterator[Media]:
|
||||
"""Retrieves all media, including nested media within properties or transformations on original media.
|
||||
This function returns a generator for all the inner media.
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@ class Storage(BaseModule):
|
||||
if media.is_stored(in_storage=self):
|
||||
logger.debug(f"{media.key} already stored, skipping")
|
||||
return
|
||||
|
||||
self.set_key(media, url, metadata)
|
||||
self.upload(media, metadata=metadata)
|
||||
media.add_url(self.get_cdn_url(media))
|
||||
@@ -50,34 +51,55 @@ class Storage(BaseModule):
|
||||
with open(media.filename, 'rb') as f:
|
||||
return self.uploadf(f, media, **kwargs)
|
||||
|
||||
def set_key(self, media: Media, url, metadata: Metadata) -> None:
|
||||
def set_key(self, media: Media, url: str, metadata: Metadata) -> None:
|
||||
"""takes the media and optionally item info and generates a key"""
|
||||
if media.key is not None and len(media.key) > 0: return
|
||||
folder = metadata.get_context('folder', '')
|
||||
filename, ext = os.path.splitext(media.filename)
|
||||
|
||||
# Handle path_generator logic
|
||||
path_generator = self.config.get("path_generator", "url")
|
||||
path_generator = self.path_generator
|
||||
if path_generator == "flat":
|
||||
path = ""
|
||||
# TODO: this slugified filename is never used; the filename_generator logic below always replaces it
|
||||
filename = slugify(filename) # Ensure filename is slugified
|
||||
elif path_generator == "url":
|
||||
path = slugify(url)
|
||||
elif path_generator == "random":
|
||||
path = self.config.get("random_path", random_str(24), True)
|
||||
path = random_str(24)
|
||||
else:
|
||||
raise ValueError(f"Invalid path_generator: {path_generator}")
|
||||
|
||||
# Handle filename_generator logic
|
||||
filename_generator = self.config.get("filename_generator", "random")
|
||||
filename_generator = self.filename_generator
|
||||
if filename_generator == "random":
|
||||
filename = random_str(24)
|
||||
elif filename_generator == "static":
|
||||
# load the hash_enricher module
|
||||
he = self.module_factory.get_module(HashEnricher, self.config)
|
||||
he = self.module_factory.get_module("hash_enricher", self.config)
|
||||
hd = he.calculate_hash(media.filename)
|
||||
filename = hd[:24]
|
||||
else:
|
||||
raise ValueError(f"Invalid filename_generator: {filename_generator}")
|
||||
|
||||
media.key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
if len(key) > self.max_file_length():
|
||||
# truncate the path
|
||||
max_path_length = self.max_file_length() - len(filename) - len(ext) - len(folder) - 1
|
||||
path = path[:max_path_length]
|
||||
logger.warning(f'Filename too long ({len(key)} characters), truncating to {self.max_file_length()} characters')
|
||||
key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
|
||||
|
||||
media.key = key
|
||||
|
||||
|
||||
def max_file_length(self) -> int:
|
||||
"""
|
||||
Returns the maximum length of a storage key (folder, path and file name combined) that can be stored in the storage service.
|
||||
|
||||
Keys that exceed this length are shortened by truncating their path component.
|
||||
Override this method in subclasses if the storage service has a different maximum file length.
|
||||
"""
|
||||
return 255 # safe max file length for most filesystems (macOS, Windows, Linux)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user