mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
115 lines
4.1 KiB
Python
115 lines
4.1 KiB
Python
"""
|
||
Base module for Storage modules – modular components that store media objects in various locations.

If you are looking to implement a new storage module, you should subclass the `Storage` class and
implement the `get_cdn_url` and `uploadf` methods.

Your module **must** also have two config variables 'path_generator' and 'filename_generator' which
determine how the key is generated for the media object.

The 'path_generator' variable can be set to one of the following values:
- 'flat': A flat structure with no subfolders
- 'url': A structure based on the URL of the media object
- 'random': A random structure

The 'filename_generator' variable can be set to one of the following values:
- 'random': A random string
- 'static': A replicable strategy such as a hash

If you don't want to use this naming convention, you can override the `set_key` method in your subclass.
|
||
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
from abc import abstractmethod
|
||
from typing import IO
|
||
import os
|
||
|
||
from auto_archiver.utils.custom_logger import logger
|
||
from slugify import slugify
|
||
|
||
from auto_archiver.utils.misc import random_str
|
||
|
||
from auto_archiver.core import Media, BaseModule, Metadata
|
||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||
|
||
|
||
class Storage(BaseModule):
    """
    Base class for implementing storage modules in the media archiving framework.

    Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
    Key generation (`set_key`) reads the `path_generator` and `filename_generator`
    attributes of the instance.
    """

    def store(self, media: Media, url: str, metadata: Metadata = None) -> None:
        """
        Stores a media object: generates its key, uploads it and records the resulting URL.

        Nothing is done when `media.is_stored(in_storage=self)` reports that the media
        is already stored in this storage.

        Args:
            media: The media object to store.
            url: URL handed to `set_key`, which uses it for the 'url' path generator.
            metadata: Optional metadata, handed to `set_key` and forwarded to `upload`.
        """
        if media.is_stored(in_storage=self):
            logger.debug(f"{media.key} already stored, skipping")
            return

        self.set_key(media, url, metadata)
        self.upload(media, metadata=metadata)
        media.add_url(self.get_cdn_url(media))

    @abstractmethod
    def get_cdn_url(self, media: Media) -> str:
        """
        Returns the URL of the media object stored in the CDN.
        """
        pass

    @abstractmethod
    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
        """
        Uploads (or saves) a file to the storage service/location.

        This method should not be called directly, but instead through the 'store' method,
        which sets up the media for storage.

        NOTE(review): `upload` passes the `Media` object itself as the second positional
        argument (not `media.key`), which does not match the `key: str` annotation here.
        Confirm against the subclasses which of the two they expect.
        """
        pass

    def upload(self, media: Media, **kwargs) -> bool:
        """
        Uploads a media object to the storage service.

        Opens `media.filename` in binary mode and hands the open file, the media object
        and any extra keyword arguments to `uploadf`, returning its result.

        This method should not be called directly, but instead be called through the 'store' method,
        which sets up the media for storage.
        """
        logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key}")
        with open(media.filename, "rb") as f:
            return self.uploadf(f, media, **kwargs)

    def set_key(self, media: Media, url: str, metadata: Metadata) -> None:
        """
        Takes the media and optionally item info and generates a key.

        The key is `<folder>/<path>/<filename><ext>` joined with `os.path.join`, where:
        - folder is the "folder" context value of `metadata` ("" when absent or when
          `metadata` is None),
        - path depends on `self.path_generator`: 'flat' (empty), 'url' (slugified URL
          truncated to 70 characters) or 'random' (`random_str(24)`),
        - filename depends on `self.filename_generator`: 'random' (`random_str(24)`) or
          'static' (first 24 characters of the hash computed by the hash_enricher module),
        - ext is the extension of `media.filename`.

        A media whose key is already set (non-empty) is left untouched.

        Raises:
            ValueError: If `path_generator` or `filename_generator` has an unsupported value.
        """
        if media.key:
            # media key is already set
            return

        # `store` allows metadata to be omitted (None); fall back to no folder in that
        # case instead of failing with an AttributeError.
        folder = metadata.get_context("folder", "") if metadata is not None else ""
        # only the extension is kept, the file name itself is always regenerated below
        _, ext = os.path.splitext(media.filename)

        # Handle path_generator logic
        path_generator = self.path_generator
        if path_generator == "flat":
            path = ""
        elif path_generator == "url":
            path = slugify(url)[:70]
        elif path_generator == "random":
            path = random_str(24)
        else:
            raise ValueError(f"Invalid path_generator: {path_generator}")

        # Handle filename_generator logic
        filename_generator = self.filename_generator
        if filename_generator == "random":
            filename = random_str(24)
        elif filename_generator == "static":
            # load the hash_enricher module
            he: HashEnricher = self.module_factory.get_module("hash_enricher", self.config)
            hd = he.calculate_hash(media.filename)
            filename = hd[:24]
        else:
            raise ValueError(f"Invalid filename_generator: {filename_generator}")

        key = os.path.join(folder, path, f"{filename}{ext}")
        media._key = key
|