Refactoring of storage code:

1. Fix some bugs in local_storage
2. Refactor unit tests to not set Media.key explicitly (unless it's well-known beforehand, which it isn't)
3. Limit length of URL for 'url' type path_generator
4. Throw an error if 'save_to' of local storage is too long
5. A few other tidyups
This commit is contained in:
Patrick Robertson
2025-03-10 16:39:31 +00:00
parent e89a8da3b4
commit 770f4c8a3d
15 changed files with 74 additions and 96 deletions

View File

@@ -14,7 +14,6 @@ from loguru import logger
from copy import deepcopy from copy import deepcopy
from auto_archiver.core.consts import MODULE_TYPES from auto_archiver.core.consts import MODULE_TYPES
from typing import Any, List, Type, Tuple
_yaml: YAML = YAML() _yaml: YAML = YAML()

View File

@@ -1,3 +1,5 @@
class SetupError(ValueError):
pass
MODULE_TYPES = [ MODULE_TYPES = [
'feeder', 'feeder',

View File

@@ -81,7 +81,8 @@ class Extractor(BaseModule):
if len(to_filename) > 64: if len(to_filename) > 64:
to_filename = to_filename[-64:] to_filename = to_filename[-64:]
to_filename = os.path.join(self.tmp_dir, to_filename) to_filename = os.path.join(self.tmp_dir, to_filename)
if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}") if verbose:
logger.debug(f"downloading {url[0:50]=} {to_filename=}")
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
} }

View File

@@ -21,14 +21,13 @@ class Media:
Represents a media file with associated properties and storage details. Represents a media file with associated properties and storage details.
Attributes: Attributes:
- filename: The file path of the media. - filename: The file path of the media as saved locally (temporarily, before uploading to the storage).
- key: An optional identifier for the media.
- urls: A list of URLs where the media is stored or accessible. - urls: A list of URLs where the media is stored or accessible.
- properties: Additional metadata or transformations for the media. - properties: Additional metadata or transformations for the media.
- _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4). - _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4).
""" """
filename: str filename: str
key: str = None _key: str = None
urls: List[str] = field(default_factory=list) urls: List[str] = field(default_factory=list)
properties: dict = field(default_factory=dict) properties: dict = field(default_factory=dict)
_mimetype: str = None # eg: image/jpeg _mimetype: str = None # eg: image/jpeg
@@ -67,6 +66,10 @@ class Media:
# checks if the media is already stored in the given storage # checks if the media is already stored in the given storage
return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"]) return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"])
@property
def key(self) -> str:
return self._key
def set(self, key: str, value: Any) -> Media: def set(self, key: str, value: Any) -> Media:
self.properties[key] = value self.properties[key] = value
return self return self

View File

@@ -23,15 +23,13 @@ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_vali
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
from .module import ModuleFactory, LazyBaseModule from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .consts import MODULE_TYPES from .consts import MODULE_TYPES, SetupError
from auto_archiver.utils.url import check_url_or_raise from auto_archiver.utils.url import check_url_or_raise
if TYPE_CHECKING: if TYPE_CHECKING:
from .base_module import BaseModule from .base_module import BaseModule
from .module import LazyBaseModule from .module import LazyBaseModule
class SetupError(ValueError):
pass
class ArchivingOrchestrator: class ArchivingOrchestrator:
# instance variables # instance variables

View File

@@ -6,6 +6,7 @@ from __future__ import annotations
from abc import abstractmethod from abc import abstractmethod
from typing import IO from typing import IO
import os import os
import platform
from loguru import logger from loguru import logger
from slugify import slugify from slugify import slugify
@@ -43,17 +44,30 @@ class Storage(BaseModule):
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
""" """
Uploads (or saves) a file to the storage service/location. Uploads (or saves) a file to the storage service/location.
This method should not be called directly, but instead through the 'store' method,
which sets up the media for storage.
""" """
pass pass
def upload(self, media: Media, **kwargs) -> bool: def upload(self, media: Media, **kwargs) -> bool:
"""
Uploads a media object to the storage service.
This method should not be called directly, but instead be called through the 'store' method,
which sets up the media for storage.
"""
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}') logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
with open(media.filename, 'rb') as f: with open(media.filename, 'rb') as f:
return self.uploadf(f, media, **kwargs) return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, url: str, metadata: Metadata) -> None: def set_key(self, media: Media, url: str, metadata: Metadata) -> None:
"""takes the media and optionally item info and generates a key""" """takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return
if media.key is not None and len(media.key) > 0:
# media key is already set
return
folder = metadata.get_context('folder', '') folder = metadata.get_context('folder', '')
filename, ext = os.path.splitext(media.filename) filename, ext = os.path.splitext(media.filename)
@@ -61,10 +75,8 @@ class Storage(BaseModule):
path_generator = self.path_generator path_generator = self.path_generator
if path_generator == "flat": if path_generator == "flat":
path = "" path = ""
# TODO: this is never used
filename = slugify(filename) # Ensure filename is slugified
elif path_generator == "url": elif path_generator == "url":
path = slugify(url) path = slugify(url)[:70]
elif path_generator == "random": elif path_generator == "random":
path = random_str(24) path = random_str(24)
else: else:
@@ -81,25 +93,7 @@ class Storage(BaseModule):
filename = hd[:24] filename = hd[:24]
else: else:
raise ValueError(f"Invalid filename_generator: {filename_generator}") raise ValueError(f"Invalid filename_generator: {filename_generator}")
key = os.path.join(folder, path, f"{filename}{ext}") key = os.path.join(folder, path, f"{filename}{ext}")
if len(key) > self.max_file_length():
# truncate the path
max_path_length = self.max_file_length() - len(filename) - len(ext) - len(folder) - 1
path = path[:max_path_length]
logger.warning(f'Filename too long ({len(key)} characters), truncating to {self.max_file_length()} characters')
key = os.path.join(folder, path, f"{filename}{ext}")
media.key = key
def max_file_length(self) -> int:
"""
Returns the maximum length of a file name that can be stored in the storage service.
Files are truncated if they exceed this length.
Override this method in subclasses if the storage service has a different maximum file length.
"""
return 255 # safe max file length for most filesystems (macOS, Windows, Linux)
media._key = key

View File

@@ -15,7 +15,6 @@ class CLIFeeder(Feeder):
for url in urls: for url in urls:
logger.debug(f"Processing {url}") logger.debug(f"Processing {url}")
m = Metadata().set_url(url) m = Metadata().set_url(url)
m.set_context("folder", "cli")
yield m yield m
logger.success(f"Processed {len(urls)} URL(s)") logger.success(f"Processed {len(urls)} URL(s)")

View File

@@ -6,37 +6,34 @@ from loguru import logger
from auto_archiver.core import Media from auto_archiver.core import Media
from auto_archiver.core import Storage from auto_archiver.core import Storage
from auto_archiver.core.consts import SetupError
class LocalStorage(Storage): class LocalStorage(Storage):
MAX_FILE_LENGTH = 255
def setup(self) -> None:
if len(self.save_to) > 200:
raise SetupError(f"Your save_to path is long, this will cause issues saving files on your computer. Please use a shorter path")
def get_cdn_url(self, media: Media) -> str: def get_cdn_url(self, media: Media) -> str:
# TODO: is this viable with Storage.configs on path/filename? dest = media.key
dest = os.path.join(self.save_to, media.key)
if self.save_absolute: if self.save_absolute:
dest = os.path.abspath(dest) dest = os.path.abspath(dest)
return dest return dest
def set_key(self, media, url, metadata):
# clarify we want to save the file to the save_to folder
old_folder = metadata.get('folder', '')
metadata.set_context('folder', os.path.join(self.save_to, metadata.get('folder', '')))
super().set_key(media, url, metadata)
# don't impact other storages that might want a different 'folder' set
metadata.set_context('folder', old_folder)
def upload(self, media: Media, **kwargs) -> bool: def upload(self, media: Media, **kwargs) -> bool:
# override parent so that we can use shutil.copy2 and keep metadata # override parent so that we can use shutil.copy2 and keep metadata
dest = os.path.join(self.save_to, media.key) dest = media.key
if len(dest) > self.max_file_length():
old_dest_length = len(dest)
filename, ext = os.path.splitext(media.key)
dir, filename = os.path.split(filename)
# see whether we should truncate filename or dir
if len(dir) > len(filename):
dir = dir[:self.MAX_FILE_LENGTH - len(self.save_to) - len(ext) - len(filename) - 1]
else:
filename = filename[:self.MAX_FILE_LENGTH - len(self.save_to) - len(ext) - len(filename) - 1]
# override media.key
media.key = os.path.join(dir, f"{filename}{ext}")
dest = os.path.join(self.save_to, dir, f"{filename}{ext}")
logger.warning(f'Filename too long ({old_dest_length} characters), truncating to {len(dest)} characters')
os.makedirs(os.path.dirname(dest), exist_ok=True) os.makedirs(os.path.dirname(dest), exist_ok=True)
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}') logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}')
@@ -46,7 +43,5 @@ class LocalStorage(Storage):
return True return True
# must be implemented even if unused # must be implemented even if unused
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
pass
def max_file_length(self):
return self.MAX_FILE_LENGTH

View File

@@ -42,7 +42,7 @@ class S3Storage(Storage):
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}") logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args) self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True return True
def is_upload_needed(self, media: Media) -> bool: def is_upload_needed(self, media: Media) -> bool:
if self.random_no_duplicate: if self.random_no_duplicate:
# checks if a folder with the hash already exists, if so it skips the upload # checks if a folder with the hash already exists, if so it skips the upload
@@ -50,13 +50,13 @@ class S3Storage(Storage):
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24]) path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key:=self.file_in_folder(path): if existing_key:=self.file_in_folder(path):
media.key = existing_key media._key = existing_key
media.set("previously archived", True) media.set("previously archived", True)
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}") logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
return False return False
_, ext = os.path.splitext(media.key) _, ext = os.path.splitext(media.key)
media.key = os.path.join(path, f"{random_str(24)}{ext}") media._key = os.path.join(path, f"{random_str(24)}{ext}")
return True return True
def file_in_folder(self, path:str) -> str: def file_in_folder(self, path:str) -> str:
@@ -66,9 +66,4 @@ class S3Storage(Storage):
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1) resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1)
if 'Contents' in resp: if 'Contents' in resp:
return resp['Contents'][0]['Key'] return resp['Contents'][0]['Key']
return False return False
def max_file_length(self):
# Amazon AWS max file length is 1024, but we will use 1000 to be safe
return 1000

View File

@@ -14,8 +14,8 @@ def enricher(setup_module):
def metadata_with_images(): def metadata_with_images():
m = Metadata() m = Metadata()
m.set_url("https://example.com") m.set_url("https://example.com")
m.add_media(Media(filename="image1.jpg", key="image1")) m.add_media(Media(filename="image1.jpg", _key="image1"))
m.add_media(Media(filename="image2.jpg", key="image2")) m.add_media(Media(filename="image2.jpg", _key="image2"))
return m return m

View File

@@ -37,10 +37,10 @@ class TestS3Storage:
def test_get_cdn_url_generation(self): def test_get_cdn_url_generation(self):
"""Test CDN URL formatting """ """Test CDN URL formatting """
media = Media("test.txt") media = Media("test.txt")
media.key = "path/to/file.txt" media._key = "path/to/file.txt"
url = self.storage.get_cdn_url(media) url = self.storage.get_cdn_url(media)
assert url == "https://cdn.example.com/path/to/file.txt" assert url == "https://cdn.example.com/path/to/file.txt"
media.key = "another/path.jpg" media._key = "another/path.jpg"
assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg"
def test_uploadf_sets_acl_public(self, mocker): def test_uploadf_sets_acl_public(self, mocker):
@@ -72,7 +72,7 @@ class TestS3Storage:
self.storage.random_no_duplicate = True self.storage.random_no_duplicate = True
mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt") mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt")
media = Media("test.txt") media = Media("test.txt")
media.key = "original_path.txt" media._key = "original_path.txt"
mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123") mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123")
assert self.storage.is_upload_needed(media) is False assert self.storage.is_upload_needed(media) is False
assert media.key == "existing_folder/existing_file.txt" assert media.key == "existing_folder/existing_file.txt"
@@ -84,7 +84,7 @@ class TestS3Storage:
def test_uploads_with_correct_parameters(self, mocker): def test_uploads_with_correct_parameters(self, mocker):
media = Media("test.txt") media = Media("test.txt")
media.key = "original_key.txt" media._key = "original_key.txt"
mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True) mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True)
media.mimetype = 'image/png' media.mimetype = 'image/png'
mock_file = mocker.MagicMock() mock_file = mocker.MagicMock()

View File

@@ -44,7 +44,7 @@ def media(tmp_path) -> Media:
file_path.write_bytes(content) file_path.write_bytes(content)
media = Media(filename=str(file_path)) media = Media(filename=str(file_path))
media.properties = {"something": "Title"} media.properties = {"something": "Title"}
media.key = "key" media._key = "key"
return media return media

View File

@@ -50,7 +50,7 @@ def test_get_id_from_parent_and_name(gdrive_storage, mocker):
def test_path_parts(): def test_path_parts():
media = Media(filename="test.jpg") media = Media(filename="test.jpg")
media.key = "folder1/folder2/test.jpg" media._key = "folder1/folder2/test.jpg"

View File

@@ -4,9 +4,9 @@ from pathlib import Path
import pytest import pytest
from auto_archiver.core import Media from auto_archiver.core import Media, Metadata
from auto_archiver.modules.local_storage import LocalStorage from auto_archiver.modules.local_storage import LocalStorage
from auto_archiver.core.consts import SetupError
@pytest.fixture @pytest.fixture
def local_storage(setup_module, tmp_path) -> LocalStorage: def local_storage(setup_module, tmp_path) -> LocalStorage:
@@ -25,37 +25,32 @@ def sample_media(tmp_path) -> Media:
"""Fixture creating a Media object with temporary source file""" """Fixture creating a Media object with temporary source file"""
src_file = tmp_path / "source.txt" src_file = tmp_path / "source.txt"
src_file.write_text("test content") src_file.write_text("test content")
return Media(key="subdir/test.txt", filename=str(src_file)) return Media(filename=str(src_file))
def test_really_long_website_url_save(local_storage, tmp_path): def test_too_long_save_path(setup_module):
long_filename = os.path.join(local_storage.save_to, "file"*100 + ".txt") with pytest.raises(SetupError):
src_file = tmp_path / "source.txt" setup_module("local_storage", {"save_to": "long"*100})
src_file.write_text("test content")
media = Media(key=long_filename, filename=str(src_file))
assert local_storage.upload(media) is True
assert src_file.read_text() == Path(local_storage.get_cdn_url(media)).read_text()
def test_get_cdn_url_relative(local_storage): def test_get_cdn_url_relative(local_storage):
media = Media(key="test.txt", filename="dummy.txt") media = Media(filename="dummy.txt")
local_storage.set_key(media, "https://example.com", Metadata())
expected = os.path.join(local_storage.save_to, media.key) expected = os.path.join(local_storage.save_to, media.key)
assert local_storage.get_cdn_url(media) == expected assert local_storage.get_cdn_url(media) == expected
def test_get_cdn_url_absolute(local_storage): def test_get_cdn_url_absolute(local_storage):
media = Media(key="test.txt", filename="dummy.txt") media = Media(filename="dummy.txt")
local_storage.save_absolute = True local_storage.save_absolute = True
local_storage.set_key(media, "https://example.com", Metadata())
expected = os.path.abspath(os.path.join(local_storage.save_to, media.key)) expected = os.path.abspath(os.path.join(local_storage.save_to, media.key))
assert local_storage.get_cdn_url(media) == expected assert local_storage.get_cdn_url(media) == expected
def test_upload_file_contents_and_metadata(local_storage, sample_media): def test_upload_file_contents_and_metadata(local_storage, sample_media):
local_storage.store(sample_media, "https://example.com", Metadata())
dest = os.path.join(local_storage.save_to, sample_media.key) dest = os.path.join(local_storage.save_to, sample_media.key)
assert local_storage.upload(sample_media) is True
assert Path(sample_media.filename).read_text() == Path(dest).read_text() assert Path(sample_media.filename).read_text() == Path(dest).read_text()
def test_upload_nonexistent_source(local_storage): def test_upload_nonexistent_source(local_storage):
media = Media(key="missing.txt", filename="nonexistent.txt") media = Media(_key="missing.txt", filename="nonexistent.txt")
with pytest.raises(FileNotFoundError): with pytest.raises(FileNotFoundError):
local_storage.upload(media) local_storage.upload(media)

View File

@@ -54,7 +54,7 @@ def storage_base():
], ],
) )
def test_storage_setup(storage_base, path_generator, filename_generator, url, expected_key, mocker): def test_storage_name_generation(storage_base, path_generator, filename_generator, url, expected_key, mocker):
mock_random = mocker.patch("auto_archiver.core.storage.random_str") mock_random = mocker.patch("auto_archiver.core.storage.random_str")
mock_random.return_value = "pretend-random" mock_random.return_value = "pretend-random"
@@ -92,7 +92,4 @@ def test_really_long_name(storage_base):
url = f"https://example.com/{'file'*100}" url = f"https://example.com/{'file'*100}"
media = Media(filename="dummy.txt") media = Media(filename="dummy.txt")
storage.set_key(media, url, Metadata()) storage.set_key(media, url, Metadata())
assert len(media.key) <= storage.max_file_length() assert media.key == f"https-example-com-{'file'*13}/6ae8a75555209fd6c44157c0.txt"
assert media.key == "https-example-com-filefilefilefilefilefilefilefilefilefilefilefile\
filefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefile\
filefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefile/6ae8a75555209fd6c44157c0.txt"