Merge pull request #243 from bellingcat/fix-long-path-names

Unit tests for storage types + fix storage too long issues for local storage
This commit is contained in:
Patrick Robertson
2025-03-11 10:05:09 +00:00
committed by GitHub
15 changed files with 180 additions and 51 deletions

View File

@@ -14,7 +14,6 @@ from loguru import logger
from copy import deepcopy
from auto_archiver.core.consts import MODULE_TYPES
from typing import Any, List, Type, Tuple
_yaml: YAML = YAML()

View File

@@ -1,3 +1,5 @@
class SetupError(ValueError):
pass
MODULE_TYPES = [
'feeder',

View File

@@ -81,7 +81,8 @@ class Extractor(BaseModule):
if len(to_filename) > 64:
to_filename = to_filename[-64:]
to_filename = os.path.join(self.tmp_dir, to_filename)
if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}")
if verbose:
logger.debug(f"downloading {url[0:50]=} {to_filename=}")
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}

View File

@@ -6,7 +6,7 @@ nested media retrieval, and type validation.
from __future__ import annotations
import os
import traceback
from typing import Any, List
from typing import Any, List, Iterator
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config
import mimetypes
@@ -21,14 +21,13 @@ class Media:
Represents a media file with associated properties and storage details.
Attributes:
- filename: The file path of the media.
- key: An optional identifier for the media.
- filename: The file path of the media as saved locally (temporarily, before uploading to the storage).
- urls: A list of URLs where the media is stored or accessible.
- properties: Additional metadata or transformations for the media.
- _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4).
"""
filename: str
key: str = None
_key: str = None
urls: List[str] = field(default_factory=list)
properties: dict = field(default_factory=dict)
_mimetype: str = None # eg: image/jpeg
@@ -47,7 +46,7 @@ class Media:
for any_media in self.all_inner_media(include_self=True):
s.store(any_media, url, metadata=metadata)
def all_inner_media(self, include_self=False):
def all_inner_media(self, include_self=False) -> Iterator[Media]:
"""Retrieves all media, including nested media within properties or transformations on original media.
This function returns a generator for all the inner media.
@@ -67,6 +66,10 @@ class Media:
# checks if the media is already stored in the given storage
return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"])
@property
def key(self) -> str:
return self._key
def set(self, key: str, value: Any) -> Media:
self.properties[key] = value
return self

View File

@@ -23,15 +23,13 @@ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_vali
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .consts import MODULE_TYPES
from .consts import MODULE_TYPES, SetupError
from auto_archiver.utils.url import check_url_or_raise
if TYPE_CHECKING:
from .base_module import BaseModule
from .module import LazyBaseModule
class SetupError(ValueError):
pass
class ArchivingOrchestrator:
# instance variables

View File

@@ -1,11 +1,29 @@
"""
Base module for Storage modules modular components that store media objects in various locations.
If you are looking to implement a new storage module, you should subclass the `Storage` class and
implement the `get_cdn_url` and `uploadf` methods.
Your module **must** also have two config variables 'path_generator' and 'filename_generator' which
determine how the key is generated for the media object. The 'path_generator' and 'filename_generator'
variables can be set to one of the following values:
- 'flat': A flat structure with no subfolders
- 'url': A structure based on the URL of the media object
- 'random': A random structure
The 'filename_generator' variable can be set to one of the following values:
- 'random': A random string
- 'static': A replicable strategy such as a hash
If you don't want to use this naming convention, you can override the `set_key` method in your subclass.
"""
from __future__ import annotations
from abc import abstractmethod
from typing import IO
import os
import platform
from loguru import logger
from slugify import slugify
@@ -27,6 +45,7 @@ class Storage(BaseModule):
if media.is_stored(in_storage=self):
logger.debug(f"{media.key} already stored, skipping")
return
self.set_key(media, url, metadata)
self.upload(media, metadata=metadata)
media.add_url(self.get_cdn_url(media))
@@ -42,42 +61,56 @@ class Storage(BaseModule):
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
"""
Uploads (or saves) a file to the storage service/location.
This method should not be called directly, but instead through the 'store' method,
which sets up the media for storage.
"""
pass
def upload(self, media: Media, **kwargs) -> bool:
"""
Uploads a media object to the storage service.
This method should not be called directly, but instead be called through the 'store' method,
which sets up the media for storage.
"""
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
with open(media.filename, 'rb') as f:
return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, url, metadata: Metadata) -> None:
def set_key(self, media: Media, url: str, metadata: Metadata) -> None:
"""takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return
if media.key is not None and len(media.key) > 0:
# media key is already set
return
folder = metadata.get_context('folder', '')
filename, ext = os.path.splitext(media.filename)
# Handle path_generator logic
path_generator = self.config.get("path_generator", "url")
path_generator = self.path_generator
if path_generator == "flat":
path = ""
filename = slugify(filename) # Ensure filename is slugified
elif path_generator == "url":
path = slugify(url)
path = slugify(url)[:70]
elif path_generator == "random":
path = self.config.get("random_path", random_str(24), True)
path = random_str(24)
else:
raise ValueError(f"Invalid path_generator: {path_generator}")
# Handle filename_generator logic
filename_generator = self.config.get("filename_generator", "random")
filename_generator = self.filename_generator
if filename_generator == "random":
filename = random_str(24)
elif filename_generator == "static":
# load the hash_enricher module
he = self.module_factory.get_module(HashEnricher, self.config)
he = self.module_factory.get_module("hash_enricher", self.config)
hd = he.calculate_hash(media.filename)
filename = hd[:24]
else:
raise ValueError(f"Invalid filename_generator: {filename_generator}")
key = os.path.join(folder, path, f"{filename}{ext}")
media.key = os.path.join(folder, path, f"{filename}{ext}")
media._key = key

View File

@@ -15,7 +15,6 @@ class CLIFeeder(Feeder):
for url in urls:
logger.debug(f"Processing {url}")
m = Metadata().set_url(url)
m.set_context("folder", "cli")
yield m
logger.success(f"Processed {len(urls)} URL(s)")

View File

@@ -6,25 +6,42 @@ from loguru import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
from auto_archiver.core.consts import SetupError
class LocalStorage(Storage):
def setup(self) -> None:
if len(self.save_to) > 200:
raise SetupError(f"Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path.")
def get_cdn_url(self, media: Media) -> str:
# TODO: is this viable with Storage.configs on path/filename?
dest = os.path.join(self.save_to, media.key)
dest = media.key
if self.save_absolute:
dest = os.path.abspath(dest)
return dest
def set_key(self, media, url, metadata):
# clarify we want to save the file to the save_to folder
old_folder = metadata.get('folder', '')
metadata.set_context('folder', os.path.join(self.save_to, metadata.get('folder', '')))
super().set_key(media, url, metadata)
# don't impact other storages that might want a different 'folder' set
metadata.set_context('folder', old_folder)
def upload(self, media: Media, **kwargs) -> bool:
# override parent so that we can use shutil.copy2 and keep metadata
dest = os.path.join(self.save_to, media.key)
dest = media.key
os.makedirs(os.path.dirname(dest), exist_ok=True)
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}')
res = shutil.copy2(media.filename, dest)
logger.info(res)
return True
# must be implemented even if unused
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
pass

View File

@@ -42,7 +42,7 @@ class S3Storage(Storage):
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True
def is_upload_needed(self, media: Media) -> bool:
if self.random_no_duplicate:
# checks if a folder with the hash already exists, if so it skips the upload
@@ -50,13 +50,13 @@ class S3Storage(Storage):
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key:=self.file_in_folder(path):
media.key = existing_key
media._key = existing_key
media.set("previously archived", True)
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
return False
_, ext = os.path.splitext(media.key)
media.key = os.path.join(path, f"{random_str(24)}{ext}")
media._key = os.path.join(path, f"{random_str(24)}{ext}")
return True
def file_in_folder(self, path:str) -> str:
@@ -66,5 +66,4 @@ class S3Storage(Storage):
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1)
if 'Contents' in resp:
return resp['Contents'][0]['Key']
return False
return False